author | Martin Liska <mliska@suse.cz> | 2022-06-17 13:05:50 +0200 |
---|---|---|
committer | Martin Liska <mliska@suse.cz> | 2022-06-17 13:05:50 +0200 |
commit | 910ef4ff32f3a53dbd12445e1eb8c5349d047140 (patch) | |
tree | ef0772e400ad08ab2bef318992cc2c1bf6fb5b7e /gcc/config | |
parent | cad2e08f6c249937e10ad5ae0d4a117923979efb (diff) | |
parent | 94018fd2675190a4353cb199da4957538f070886 (diff) | |
Merge branch 'master' into devel/sphinx
Diffstat (limited to 'gcc/config')
176 files changed, 26335 insertions, 3488 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 5217dbd..e0a741a 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -716,6 +716,7 @@ static GTY(()) struct aarch64_simd_type_info aarch64_simd_types [] = { }; #undef ENTRY +static machine_mode aarch64_simd_tuple_modes[ARM_NEON_H_TYPES_LAST][3]; static GTY(()) tree aarch64_simd_tuple_types[ARM_NEON_H_TYPES_LAST][3]; static GTY(()) tree aarch64_simd_intOI_type_node = NULL_TREE; @@ -831,7 +832,7 @@ aarch64_lookup_simd_builtin_type (machine_mode mode, enum aarch64_type_qualifiers q) { int i; - int nelts = sizeof (aarch64_simd_types) / sizeof (aarch64_simd_types[0]); + int nelts = ARRAY_SIZE (aarch64_simd_types); /* Non-poly scalar modes map to standard types not in the table. */ if (q != qualifier_poly && !VECTOR_MODE_P (mode)) @@ -844,7 +845,7 @@ aarch64_lookup_simd_builtin_type (machine_mode mode, return aarch64_simd_types[i].itype; if (aarch64_simd_tuple_types[i][0] != NULL_TREE) for (int j = 0; j < 3; j++) - if (TYPE_MODE (aarch64_simd_tuple_types[i][j]) == mode + if (aarch64_simd_tuple_modes[i][j] == mode && aarch64_simd_types[i].q == q) return aarch64_simd_tuple_types[i][j]; } @@ -868,7 +869,7 @@ static void aarch64_init_simd_builtin_types (void) { int i; - int nelts = sizeof (aarch64_simd_types) / sizeof (aarch64_simd_types[0]); + int nelts = ARRAY_SIZE (aarch64_simd_types); tree tdecl; /* Init all the element types built by the front-end. */ @@ -1297,8 +1298,10 @@ register_tuple_type (unsigned int num_vectors, unsigned int type_index) } unsigned int alignment - = (known_eq (GET_MODE_SIZE (type->mode), 16) ? 128 : 64); - gcc_assert (TYPE_MODE_RAW (array_type) == TYPE_MODE (array_type) + = known_eq (GET_MODE_SIZE (type->mode), 16) ? 128 : 64; + machine_mode tuple_mode = TYPE_MODE_RAW (array_type); + gcc_assert (VECTOR_MODE_P (tuple_mode) + && TYPE_MODE (array_type) == tuple_mode && TYPE_ALIGN (array_type) == alignment); tree field = build_decl (input_location, FIELD_DECL, @@ -1309,14 +1312,13 @@ register_tuple_type (unsigned int num_vectors, unsigned int type_index) make_array_slice (&field, 1)); gcc_assert (TYPE_MODE_RAW (t) == TYPE_MODE (t) - && TYPE_ALIGN (t) == alignment); - - if (num_vectors == 2) - aarch64_simd_tuple_types[type_index][0] = t; - else if (num_vectors == 3) - aarch64_simd_tuple_types[type_index][1] = t; - else if (num_vectors == 4) - aarch64_simd_tuple_types[type_index][2] = t; + && (flag_pack_struct + || maximum_field_alignment + || (TYPE_MODE_RAW (t) == tuple_mode + && TYPE_ALIGN (t) == alignment))); + + aarch64_simd_tuple_modes[type_index][num_vectors - 2] = tuple_mode; + aarch64_simd_tuple_types[type_index][num_vectors - 2] = t; } static bool @@ -1325,10 +1327,31 @@ aarch64_scalar_builtin_type_p (aarch64_simd_type t) return (t == Poly8_t || t == Poly16_t || t == Poly64_t || t == Poly128_t); } +/* Enable AARCH64_FL_* flags EXTRA_FLAGS on top of the base Advanced SIMD + set. */ +aarch64_simd_switcher::aarch64_simd_switcher (unsigned int extra_flags) + : m_old_isa_flags (aarch64_isa_flags), + m_old_general_regs_only (TARGET_GENERAL_REGS_ONLY) +{ + /* Changing the ISA flags should be enough here. We shouldn't need to + pay the compile-time cost of a full target switch. 
*/ + aarch64_isa_flags = AARCH64_FL_FP | AARCH64_FL_SIMD | extra_flags; + global_options.x_target_flags &= ~MASK_GENERAL_REGS_ONLY; +} + +aarch64_simd_switcher::~aarch64_simd_switcher () +{ + if (m_old_general_regs_only) + global_options.x_target_flags |= MASK_GENERAL_REGS_ONLY; + aarch64_isa_flags = m_old_isa_flags; +} + /* Implement #pragma GCC aarch64 "arm_neon.h". */ void handle_arm_neon_h (void) { + aarch64_simd_switcher simd; + /* Register the AdvSIMD vector tuple types. */ for (unsigned int i = 0; i < ARM_NEON_H_TYPES_LAST; i++) for (unsigned int count = 2; count <= 4; ++count) @@ -1411,7 +1434,7 @@ aarch64_init_builtin_rsqrt (void) }; builtin_decls_data *bdd = bdda; - builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data)); + builtin_decls_data *bdd_end = bdd + (ARRAY_SIZE (bdda)); for (; bdd < bdd_end; bdd++) { @@ -1641,6 +1664,14 @@ aarch64_init_ls64_builtins (void) = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code); } +/* Implement #pragma GCC aarch64 "arm_acle.h". */ +void +handle_arm_acle_h (void) +{ + if (TARGET_LS64) + aarch64_init_ls64_builtins (); +} + /* Initialize fpsr fpcr getters and setters. */ static void @@ -1703,8 +1734,10 @@ aarch64_general_init_builtins (void) aarch64_init_bf16_types (); - if (TARGET_SIMD) + { + aarch64_simd_switcher simd; aarch64_init_simd_builtins (); + } aarch64_init_crc32_builtins (); aarch64_init_builtin_rsqrt (); @@ -1730,9 +1763,6 @@ aarch64_general_init_builtins (void) if (TARGET_MEMTAG) aarch64_init_memtag_builtins (); - - if (TARGET_LS64) - aarch64_init_ls64_builtins (); } /* Implement TARGET_BUILTIN_DECL for the AARCH64_BUILTIN_GENERAL group. */ diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index caf8e33..3d2fb5e 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -82,7 +82,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) { aarch64_def_or_undef (flag_unsafe_math_optimizations, "__ARM_FP_FAST", pfile); - builtin_define_with_int_value ("__ARM_ARCH", aarch64_architecture_version); + builtin_define_with_int_value ("__ARM_ARCH", AARCH64_ISA_V9 ? 9 : 8); builtin_define_with_int_value ("__ARM_SIZEOF_MINIMAL_ENUM", flag_short_enums ? 1 : 4); @@ -302,6 +302,8 @@ aarch64_pragma_aarch64 (cpp_reader *) aarch64_sve::handle_arm_sve_h (); else if (strcmp (name, "arm_neon.h") == 0) handle_arm_neon_h (); + else if (strcmp (name, "arm_acle.h") == 0) + handle_arm_acle_h (); else error ("unknown %<#pragma GCC aarch64%> option %qs", name); } diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index eda3a4e..41d9535 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -145,9 +145,6 @@ AARCH64_CORE("neoverse-512tvb", neoverse512tvb, cortexa57, 8_4A, AARCH64_FL_FOR /* Qualcomm ('Q') cores. */ AARCH64_CORE("saphira", saphira, saphira, 8_4A, AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira, 0x51, 0xC01, -1) -/* Armv8.5-A Architecture Processors. */ -AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, 8_5A, AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG, neoversen2, 0x41, 0xd49, -1) - /* ARMv8-A big.LITTLE implementations. 
*/ AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, AARCH64_BIG_LITTLE (0xd07, 0xd03), -1) @@ -172,4 +169,8 @@ AARCH64_CORE("cortex-a710", cortexa710, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_CORE("cortex-x2", cortexx2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd48, -1) +AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG | AARCH64_FL_PROFILE, neoversen2, 0x41, 0xd49, -1) + +AARCH64_CORE("demeter", demeter, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG | AARCH64_FL_PROFILE, demeter, 0x41, 0xd4f, -1) + #undef AARCH64_CORE diff --git a/gcc/config/aarch64/aarch64-opts.h b/gcc/config/aarch64/aarch64-opts.h index 93572fe..421648a 100644 --- a/gcc/config/aarch64/aarch64-opts.h +++ b/gcc/config/aarch64/aarch64-opts.h @@ -98,4 +98,10 @@ enum stack_protector_guard { SSP_GLOBAL /* global canary */ }; +/* The key type that -msign-return-address should use. */ +enum aarch64_key_type { + AARCH64_KEY_A, + AARCH64_KEY_B +}; + #endif diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index d0e78d6..dabd047 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -507,6 +507,18 @@ struct cpu_prefetch_tune const int default_opt_level; }; +/* Model the costs for loads/stores for the register allocators so that it can + do more accurate spill heuristics. */ +struct cpu_memmov_cost +{ + int load_int; + int store_int; + int load_fp; + int store_fp; + int load_pred; + int store_pred; +}; + struct tune_params { const struct cpu_cost_table *insn_extra_cost; @@ -519,7 +531,8 @@ struct tune_params or SVE_NOT_IMPLEMENTED if not applicable. Only used for tuning decisions, does not disable VLA vectorization. */ unsigned int sve_width; - int memmov_cost; + /* Structure used by reload to cost spills. */ + struct cpu_memmov_cost memmov_cost; int issue_rate; unsigned int fusible_ops; const char *function_align; @@ -659,14 +672,6 @@ enum simd_immediate_check { AARCH64_CHECK_MOV = AARCH64_CHECK_ORR | AARCH64_CHECK_BIC }; -/* The key type that -msign-return-address should use. */ -enum aarch64_key_type { - AARCH64_KEY_A, - AARCH64_KEY_B -}; - -extern enum aarch64_key_type aarch64_ra_sign_key; - extern struct tune_params aarch64_tune_params; /* The available SVE predicate patterns, known in the ACLE as "svpattern". */ @@ -733,6 +738,19 @@ const unsigned int AARCH64_BUILTIN_SHIFT = 1; /* Mask that selects the aarch64_builtin_class part of a function code. */ const unsigned int AARCH64_BUILTIN_CLASS = (1 << AARCH64_BUILTIN_SHIFT) - 1; +/* RAII class for enabling enough features to define built-in types + and implement the arm_neon.h pragma. 
*/ +class aarch64_simd_switcher +{ +public: + aarch64_simd_switcher (unsigned int extra_flags = 0); + ~aarch64_simd_switcher (); + +private: + unsigned long m_old_isa_flags; + bool m_old_general_regs_only; +}; + void aarch64_post_cfi_startproc (void); poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned); int aarch64_get_condition_code (rtx); @@ -969,6 +987,7 @@ rtx aarch64_general_expand_builtin (unsigned int, tree, rtx, int); tree aarch64_general_builtin_decl (unsigned, bool); tree aarch64_general_builtin_rsqrt (unsigned int); tree aarch64_builtin_vectorized_function (unsigned int, tree, tree); +void handle_arm_acle_h (void); void handle_arm_neon_h (void); namespace aarch64_sve { @@ -1038,7 +1057,7 @@ bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT); struct atomic_ool_names { - const char *str[5][4]; + const char *str[5][5]; }; rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 1873342..a00e1c6 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3385,20 +3385,6 @@ ;; 'across lanes' add. -(define_expand "reduc_plus_scal_<mode>" - [(match_operand:<VEL> 0 "register_operand") - (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand")] - UNSPEC_ADDV)] - "TARGET_SIMD" - { - rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0); - rtx scratch = gen_reg_rtx (<MODE>mode); - emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1])); - emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt)); - DONE; - } -) - (define_insn "aarch64_faddp<mode>" [(set (match_operand:VHSDF 0 "register_operand" "=w") (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") @@ -3409,31 +3395,22 @@ [(set_attr "type" "neon_fp_reduc_add_<stype><q>")] ) -(define_insn "aarch64_reduc_plus_internal<mode>" - [(set (match_operand:VDQV 0 "register_operand" "=w") - (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")] +(define_insn "reduc_plus_scal_<mode>" + [(set (match_operand:<VEL> 0 "register_operand" "=w") + (unspec:<VEL> [(match_operand:VDQV 1 "register_operand" "w")] UNSPEC_ADDV))] "TARGET_SIMD" "add<VDQV:vp>\\t%<Vetype>0, %1.<Vtype>" [(set_attr "type" "neon_reduc_add<q>")] ) -(define_insn "aarch64_<su>addlv<mode>" - [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w") - (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")] - USADDLV))] - "TARGET_SIMD" - "<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>" - [(set_attr "type" "neon_reduc_add<q>")] -) - -(define_insn "aarch64_<su>addlp<mode>" - [(set (match_operand:<VDBLW> 0 "register_operand" "=w") - (unspec:<VDBLW> [(match_operand:VDQV_L 1 "register_operand" "w")] - USADDLP))] +(define_insn "reduc_plus_scal_v2si" + [(set (match_operand:SI 0 "register_operand" "=w") + (unspec:SI [(match_operand:V2SI 1 "register_operand" "w")] + UNSPEC_ADDV))] "TARGET_SIMD" - "<su>addlp\\t%0.<Vwhalf>, %1.<Vtype>" - [(set_attr "type" "neon_reduc_add<q>")] + "addp\\t%0.2s, %1.2s, %1.2s" + [(set_attr "type" "neon_reduc_add")] ) ;; ADDV with result zero-extended to SI/DImode (for popcount). 
@@ -3447,15 +3424,6 @@ [(set_attr "type" "neon_reduc_add<VDQV_E:q>")] ) -(define_insn "aarch64_reduc_plus_internalv2si" - [(set (match_operand:V2SI 0 "register_operand" "=w") - (unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")] - UNSPEC_ADDV))] - "TARGET_SIMD" - "addp\\t%0.2s, %1.2s, %1.2s" - [(set_attr "type" "neon_reduc_add")] -) - (define_insn "reduc_plus_scal_<mode>" [(set (match_operand:<VEL> 0 "register_operand" "=w") (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")] @@ -3467,7 +3435,7 @@ (define_expand "reduc_plus_scal_v4sf" [(set (match_operand:SF 0 "register_operand") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand")] + (unspec:SF [(match_operand:V4SF 1 "register_operand")] UNSPEC_FADDV))] "TARGET_SIMD" { @@ -3479,6 +3447,24 @@ DONE; }) +(define_insn "aarch64_<su>addlv<mode>" + [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w") + (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")] + USADDLV))] + "TARGET_SIMD" + "<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>" + [(set_attr "type" "neon_reduc_add<q>")] +) + +(define_insn "aarch64_<su>addlp<mode>" + [(set (match_operand:<VDBLW> 0 "register_operand" "=w") + (unspec:<VDBLW> [(match_operand:VDQV_L 1 "register_operand" "w")] + USADDLP))] + "TARGET_SIMD" + "<su>addlp\\t%0.<Vwhalf>, %1.<Vtype>" + [(set_attr "type" "neon_reduc_add<q>")] +) + (define_insn "clrsb<mode>2" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (clrsb:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))] diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index c24c054..82f9eba 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -44,6 +44,7 @@ #include "aarch64-sve-builtins-shapes.h" #include "aarch64-sve-builtins-base.h" #include "aarch64-sve-builtins-functions.h" +#include "ssa.h" using namespace aarch64_sve; @@ -148,7 +149,7 @@ class svabd_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* The integer operations are represented as the subtraction of the minimum from the maximum, with the signedness of the instruction @@ -179,7 +180,7 @@ public: CONSTEXPR svac_impl (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.add_ptrue_hint (0, e.gp_mode (0)); insn_code icode = code_for_aarch64_pred_fac (m_unspec, e.vector_mode (0)); @@ -194,7 +195,7 @@ class svadda_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Put the predicate last, as required by mask_fold_left_plus_optab. */ e.rotate_inputs_left (0, 3); @@ -211,7 +212,7 @@ public: CONSTEXPR svadr_bhwd_impl (unsigned int shift) : m_shift (shift) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = GET_MODE (e.args[0]); if (m_shift == 0) @@ -231,7 +232,7 @@ class svbic_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert svbic of a constant into svand of its inverse. 
*/ if (CONST_INT_P (e.args[2])) @@ -261,7 +262,7 @@ public: CONSTEXPR svbrk_binary_impl (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (code_for_aarch64_brk (m_unspec)); } @@ -277,7 +278,7 @@ public: CONSTEXPR svbrk_unary_impl (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_cond_insn (code_for_aarch64_brk (m_unspec)); } @@ -290,7 +291,7 @@ class svcadd_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert the rotation amount into a specific unspec. */ int rot = INTVAL (e.args.pop ()); @@ -311,7 +312,7 @@ public: CONSTEXPR svclast_impl (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Match the fold_extract_optab order. */ std::swap (e.args[0], e.args[1]); @@ -332,7 +333,7 @@ class svcmla_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert the rotation amount into a specific unspec. */ int rot = INTVAL (e.args.pop ()); @@ -355,7 +356,7 @@ class svcmla_lane_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert the rotation amount into a specific unspec. */ int rot = INTVAL (e.args.pop ()); @@ -384,7 +385,7 @@ public: : m_code (code), m_unspec_for_fp (unspec_for_fp) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree pg = gimple_call_arg (f.call, 0); tree rhs1 = gimple_call_arg (f.call, 1); @@ -406,7 +407,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); @@ -442,7 +443,7 @@ public: m_unspec_for_uint (unspec_for_uint) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); bool unsigned_p = e.type_suffix (0).unsigned_p; @@ -480,7 +481,7 @@ class svcmpuo_impl : public quiet<function_base> { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.add_ptrue_hint (0, e.gp_mode (0)); return e.use_exact_insn (code_for_aarch64_pred_fcmuo (e.vector_mode (0))); @@ -491,7 +492,7 @@ class svcnot_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); if (e.pred == PRED_x) @@ -514,7 +515,7 @@ public: CONSTEXPR svcnt_bhwd_impl (machine_mode ref_mode) : m_ref_mode (ref_mode) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree count = build_int_cstu (TREE_TYPE (f.lhs), GET_MODE_NUNITS (m_ref_mode)); @@ -522,7 +523,7 @@ public: } rtx - expand (function_expander &) const OVERRIDE + expand (function_expander &) const override { return gen_int_mode (GET_MODE_NUNITS (m_ref_mode), DImode); } @@ -539,7 +540,7 @@ public: : svcnt_bhwd_impl (ref_mode) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree pattern_arg = gimple_call_arg (f.call, 0); aarch64_svpattern pattern = (aarch64_svpattern) tree_to_shwi (pattern_arg); @@ -562,7 +563,7 
@@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { unsigned int elements_per_vq = 128 / GET_MODE_UNIT_BITSIZE (m_ref_mode); e.args.quick_push (gen_int_mode (elements_per_vq, DImode)); @@ -575,7 +576,7 @@ class svcntp_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); e.add_ptrue_hint (0, mode); @@ -591,7 +592,7 @@ public: : quiet<multi_vector_function> (vectors_per_tuple) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { unsigned int nargs = gimple_call_num_args (f.call); tree lhs_type = TREE_TYPE (f.lhs); @@ -621,7 +622,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { rtx lhs_tuple = e.get_nonoverlapping_reg_target (); @@ -643,7 +644,7 @@ class svcvt_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode0 = e.vector_mode (0); machine_mode mode1 = e.vector_mode (1); @@ -706,7 +707,7 @@ class svdot_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* In the optab, the multiplication operands come before the accumulator operand. The optab is keyed off the multiplication mode. */ @@ -729,7 +730,7 @@ public: unspec_for_float) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Use the same ordering as the dot_prod_optab, with the accumulator last. */ @@ -744,7 +745,7 @@ class svdup_impl : public quiet<function_base> { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree vec_type = TREE_TYPE (f.lhs); tree rhs = gimple_call_arg (f.call, f.pred == PRED_none ? 0 : 1); @@ -784,7 +785,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { if (e.pred == PRED_none || e.pred == PRED_x) /* There's no benefit to using predicated instructions for _x here. */ @@ -812,7 +813,7 @@ class svdup_lane_impl : public quiet<function_base> { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* The native DUP lane has an index range of 64 bytes. 
*/ machine_mode mode = e.vector_mode (0); @@ -829,7 +830,7 @@ class svdupq_impl : public quiet<function_base> { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree vec_type = TREE_TYPE (f.lhs); unsigned int nargs = gimple_call_num_args (f.call); @@ -851,7 +852,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); unsigned int elements_per_vq = e.args.length (); @@ -900,7 +901,7 @@ class svdupq_lane_impl : public quiet<function_base> { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); rtx index = e.args[1]; @@ -964,7 +965,7 @@ public: : m_from_mode (from_mode) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { if (e.type_suffix (0).unsigned_p) { @@ -1006,7 +1007,7 @@ public: : quiet<multi_vector_function> (vectors_per_tuple) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* Fold into a normal gimple component access. */ tree rhs_tuple = gimple_call_arg (f.call, 0); @@ -1020,7 +1021,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Fold the access into a subreg rvalue. */ return simplify_gen_subreg (e.vector_mode (0), e.args[0], @@ -1033,7 +1034,7 @@ class svindex_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (e.direct_optab_handler (vec_series_optab)); } @@ -1043,7 +1044,7 @@ class svinsr_impl : public quiet<function_base> { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { gcall *new_call = gimple_build_call_internal (IFN_VEC_SHL_INSERT, 2, gimple_call_arg (f.call, 0), @@ -1053,7 +1054,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = direct_optab_handler (vec_shl_insert_optab, e.vector_mode (0)); @@ -1068,7 +1069,7 @@ public: CONSTEXPR svlast_impl (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (code_for_extract (m_unspec, e.vector_mode (0))); } @@ -1081,13 +1082,13 @@ class svld1_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY; } gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree vectype = f.vector_type (0); @@ -1105,7 +1106,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = convert_optab_handler (maskload_optab, e.vector_mode (0), e.gp_mode (0)); @@ -1121,7 +1122,7 @@ public: : extending_load (memory_type) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = code_for_aarch64_load (UNSPEC_LD1_SVE, extend_rtx_code (), e.vector_mode (0), @@ -1134,13 +1135,13 @@ class svld1_gather_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY; } 
rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_gather_address_operands (1); /* Put the predicate last, as required by mask_gather_load_optab. */ @@ -1161,7 +1162,7 @@ public: : extending_load (memory_type) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_gather_address_operands (1); /* Put the predicate last, since the extending gathers use the same @@ -1180,13 +1181,13 @@ class load_replicate : public function_base { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY; } tree - memory_scalar_type (const function_instance &fi) const OVERRIDE + memory_scalar_type (const function_instance &fi) const override { return fi.scalar_type (0); } @@ -1196,30 +1197,88 @@ class svld1rq_impl : public load_replicate { public: machine_mode - memory_vector_mode (const function_instance &fi) const OVERRIDE + memory_vector_mode (const function_instance &fi) const override { return aarch64_vq_mode (GET_MODE_INNER (fi.vector_mode (0))).require (); } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); return e.use_contiguous_load_insn (icode); } + + gimple * + fold (gimple_folder &f) const override + { + tree arg0 = gimple_call_arg (f.call, 0); + tree arg1 = gimple_call_arg (f.call, 1); + + /* Transform: + lhs = svld1rq ({-1, -1, ... }, arg1) + into: + tmp = mem_ref<vectype> [(elem * {ref-all}) arg1] + lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>. + on little endian target. + vectype is the corresponding ADVSIMD type. */ + + if (!BYTES_BIG_ENDIAN + && integer_all_onesp (arg0)) + { + tree lhs = gimple_call_lhs (f.call); + tree lhs_type = TREE_TYPE (lhs); + poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type); + tree eltype = TREE_TYPE (lhs_type); + + scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type)); + machine_mode vq_mode = aarch64_vq_mode (elmode).require (); + tree vectype = build_vector_type_for_mode (eltype, vq_mode); + + tree elt_ptr_type + = build_pointer_type_for_mode (eltype, VOIDmode, true); + tree zero = build_zero_cst (elt_ptr_type); + + /* Use element type alignment. 
*/ + tree access_type + = build_aligned_type (vectype, TYPE_ALIGN (eltype)); + + tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0); + tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero); + gimple *mem_ref_stmt + = gimple_build_assign (mem_ref_lhs, mem_ref_op); + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); + + int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant (); + vec_perm_builder sel (lhs_len, source_nelts, 1); + for (int i = 0; i < source_nelts; i++) + sel.quick_push (i); + + vec_perm_indices indices (sel, 1, source_nelts); + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), + TYPE_MODE (access_type), + indices)); + tree mask_type = build_vector_type (ssizetype, lhs_len); + tree mask = vec_perm_indices_to_tree (mask_type, indices); + return gimple_build_assign (lhs, VEC_PERM_EXPR, + mem_ref_lhs, mem_ref_lhs, mask); + } + + return NULL; + } }; class svld1ro_impl : public load_replicate { public: machine_mode - memory_vector_mode (const function_instance &) const OVERRIDE + memory_vector_mode (const function_instance &) const override { return OImode; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = code_for_aarch64_sve_ld1ro (e.vector_mode (0)); return e.use_contiguous_load_insn (icode); @@ -1234,13 +1293,13 @@ public: : full_width_access (vectors_per_tuple) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY; } gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree tuple_type = TREE_TYPE (f.lhs); tree vectype = f.vector_type (0); @@ -1275,7 +1334,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode tuple_mode = TYPE_MODE (TREE_TYPE (e.call_expr)); insn_code icode = convert_optab_handler (vec_mask_load_lanes_optab, @@ -1288,13 +1347,13 @@ class svldff1_gather_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* See the block comment in aarch64-sve.md for details about the FFR handling. */ @@ -1317,7 +1376,7 @@ public: : extending_load (memory_type) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* See the block comment in aarch64-sve.md for details about the FFR handling. 
*/ @@ -1340,13 +1399,13 @@ class svldnt1_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = code_for_aarch64_ldnt1 (e.vector_mode (0)); return e.use_contiguous_load_insn (icode); @@ -1360,13 +1419,13 @@ public: CONSTEXPR svldxf1_impl (int unspec) : m_unspec (unspec) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* See the block comment in aarch64-sve.md for details about the FFR handling. */ @@ -1388,13 +1447,13 @@ public: : extending_load (memory_type), m_unspec (unspec) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* See the block comment in aarch64-sve.md for details about the FFR handling. */ @@ -1414,7 +1473,7 @@ class svlen_impl : public quiet<function_base> { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* The argument only exists for its type. */ tree rhs_type = TREE_TYPE (gimple_call_arg (f.call, 0)); @@ -1424,7 +1483,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* The argument only exists for its type. */ return gen_int_mode (GET_MODE_NUNITS (e.vector_mode (0)), DImode); @@ -1435,7 +1494,7 @@ class svmad_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return expand_mad (e); } @@ -1445,7 +1504,7 @@ class svmla_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Put the accumulator at the end (argument 3), but keep it as the merge input for _m functions. */ @@ -1458,7 +1517,7 @@ class svmla_lane_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { if (e.type_suffix (0).integer_p) { @@ -1473,7 +1532,7 @@ class svmls_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Put the accumulator at the end (argument 3), but keep it as the merge input for _m functions. */ @@ -1486,7 +1545,7 @@ class svmov_impl : public function_base { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { return gimple_build_assign (f.lhs, BIT_AND_EXPR, gimple_call_arg (f.call, 0), @@ -1494,7 +1553,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* The canonical form for the assembler alias "MOV Pa.B, Pb/Z, Pc.B" is "AND Pa.B, Pb/Z, Pc.B, Pc.B". 
*/ @@ -1508,7 +1567,7 @@ class svmls_lane_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { if (e.type_suffix (0).integer_p) { @@ -1523,7 +1582,7 @@ class svmmla_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode; if (e.type_suffix (0).integer_p) @@ -1543,7 +1602,7 @@ class svmsb_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return expand_msb (e); } @@ -1553,7 +1612,7 @@ class svnand_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { gcc_assert (e.pred == PRED_z); return e.use_exact_insn (CODE_FOR_aarch64_pred_nandvnx16bi_z); @@ -1564,7 +1623,7 @@ class svnor_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { gcc_assert (e.pred == PRED_z); return e.use_exact_insn (CODE_FOR_aarch64_pred_norvnx16bi_z); @@ -1577,7 +1636,7 @@ public: CONSTEXPR svnot_impl () : rtx_code_function (NOT, NOT, -1) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { if (e.type_suffix_ids[0] == TYPE_SUFFIX_b) { @@ -1595,7 +1654,7 @@ class svorn_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { gcc_assert (e.pred == PRED_z); return e.use_exact_insn (CODE_FOR_aarch64_pred_ornvnx16bi_z); @@ -1606,13 +1665,13 @@ class svpfalse_impl : public function_base { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { return f.fold_to_pfalse (); } rtx - expand (function_expander &) const OVERRIDE + expand (function_expander &) const override { return CONST0_RTX (VNx16BImode); } @@ -1625,7 +1684,7 @@ public: CONSTEXPR svpfirst_svpnext_impl (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); e.add_ptrue_hint (0, mode); @@ -1643,13 +1702,13 @@ public: CONSTEXPR svprf_bhwd_impl (machine_mode mode) : m_mode (mode) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_PREFETCH_MEMORY; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_prefetch_operands (); insn_code icode = code_for_aarch64_sve_prefetch (m_mode); @@ -1667,19 +1726,19 @@ public: CONSTEXPR svprf_bhwd_gather_impl (machine_mode mode) : m_mode (mode) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_PREFETCH_MEMORY; } machine_mode - memory_vector_mode (const function_instance &) const OVERRIDE + memory_vector_mode (const function_instance &) const override { return m_mode; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_prefetch_operands (); e.prepare_gather_address_operands (1); @@ -1705,7 +1764,7 @@ public: CONSTEXPR svptest_impl (rtx_code compare) : m_compare (compare) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* See whether GP is an exact ptrue 
for some predicate mode; i.e. whether converting the GP to that mode will not drop @@ -1751,13 +1810,13 @@ class svptrue_impl : public function_base { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { return f.fold_to_ptrue (); } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return aarch64_ptrue_all (e.type_suffix (0).element_bytes); } @@ -1767,7 +1826,7 @@ class svptrue_pat_impl : public function_base { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree pattern_arg = gimple_call_arg (f.call, 0); aarch64_svpattern pattern = (aarch64_svpattern) tree_to_shwi (pattern_arg); @@ -1788,7 +1847,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* In rtl, the predicate is represented as the constant: @@ -1816,7 +1875,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Treat non-_pat functions in the same way as _pat functions with an SV_ALL argument. */ @@ -1877,7 +1936,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { rtx_code code = (e.type_suffix (0).unsigned_p ? m_code_for_uint @@ -1908,13 +1967,13 @@ class svrdffr_impl : public function_base { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_FFR; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* See the block comment in aarch64-sve.md for details about the FFR handling. */ @@ -1931,7 +1990,7 @@ class svreinterpret_impl : public quiet<function_base> { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* Punt to rtl if the effect of the reinterpret on registers does not conform to GCC's endianness model. */ @@ -1947,7 +2006,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); return e.use_exact_insn (code_for_aarch64_sve_reinterpret (mode)); @@ -1958,7 +2017,7 @@ class svrev_impl : public permute { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* Punt for now on _b16 and wider; we'd need more complex evpc logic to rerecognize the result. */ @@ -1974,7 +2033,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (code_for_aarch64_sve_rev (e.vector_mode (0))); } @@ -1984,7 +2043,7 @@ class svsel_impl : public quiet<function_base> { public: gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* svsel corresponds exactly to VEC_COND_EXPR. */ gimple_seq stmts = NULL; @@ -1996,7 +2055,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* svsel (cond, truev, falsev) is vcond_mask (truev, falsev, cond). 
*/ e.rotate_inputs_left (0, 3); @@ -2015,7 +2074,7 @@ public: : quiet<multi_vector_function> (vectors_per_tuple) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree rhs_tuple = gimple_call_arg (f.call, 0); tree index = gimple_call_arg (f.call, 1); @@ -2042,7 +2101,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { rtx rhs_tuple = e.args[0]; unsigned int index = INTVAL (e.args[1]); @@ -2065,13 +2124,13 @@ class svsetffr_impl : public function_base { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_WRITE_FFR; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.args.quick_push (CONSTM1_RTX (VNx16BImode)); return e.use_exact_insn (CODE_FOR_aarch64_wrffr); @@ -2082,13 +2141,13 @@ class svst1_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_WRITE_MEMORY; } gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree vectype = f.vector_type (0); @@ -2105,7 +2164,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = convert_optab_handler (maskstore_optab, e.vector_mode (0), e.gp_mode (0)); @@ -2117,13 +2176,13 @@ class svst1_scatter_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_WRITE_MEMORY; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_gather_address_operands (1); /* Put the predicate last, as required by mask_scatter_store_optab. 
*/ @@ -2144,7 +2203,7 @@ public: : truncating_store (to_mode) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_gather_address_operands (1); /* Put the predicate last, since the truncating scatters use the same @@ -2164,7 +2223,7 @@ public: : truncating_store (to_mode) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = code_for_aarch64_store_trunc (e.memory_vector_mode (), e.vector_mode (0)); @@ -2180,13 +2239,13 @@ public: : full_width_access (vectors_per_tuple) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_WRITE_MEMORY; } gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { tree vectype = f.vector_type (0); @@ -2208,7 +2267,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode tuple_mode = GET_MODE (e.args.last ()); insn_code icode = convert_optab_handler (vec_mask_store_lanes_optab, @@ -2221,13 +2280,13 @@ class svstnt1_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_WRITE_MEMORY; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = code_for_aarch64_stnt1 (e.vector_mode (0)); return e.use_contiguous_store_insn (icode); @@ -2241,7 +2300,7 @@ public: : rtx_code_function (MINUS, MINUS, UNSPEC_COND_FSUB) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Canonicalize subtractions of constants to additions. */ machine_mode mode = e.vector_mode (0); @@ -2256,7 +2315,7 @@ class svtbl_impl : public permute { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (code_for_aarch64_sve_tbl (e.vector_mode (0))); } @@ -2270,7 +2329,7 @@ public: : binary_permute (base ? UNSPEC_TRN2 : UNSPEC_TRN1), m_base (base) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* svtrn1: { 0, nelts, 2, nelts + 2, 4, nelts + 4, ... } svtrn2: as for svtrn1, but with 1 added to each index. */ @@ -2296,7 +2355,7 @@ public: : quiet<multi_vector_function> (vectors_per_tuple) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { rtx target = e.get_reg_target (); emit_clobber (copy_rtx (target)); @@ -2311,7 +2370,7 @@ public: CONSTEXPR svunpk_impl (bool high_p) : m_high_p (high_p) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* Don't fold the predicate ops, since every bit of the svbool_t result is significant. */ @@ -2326,7 +2385,7 @@ public: } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = GET_MODE (e.args[0]); unsigned int unpacku = m_high_p ? 
UNSPEC_UNPACKUHI : UNSPEC_UNPACKULO; @@ -2353,7 +2412,7 @@ public: CONSTEXPR svusdot_impl (bool su) : m_su (su) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* The implementation of the ACLE function svsudot (for the non-lane version) is through the USDOT instruction but with the second and third @@ -2382,7 +2441,7 @@ public: : binary_permute (base ? UNSPEC_UZP2 : UNSPEC_UZP1), m_base (base) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* svuzp1: { 0, 2, 4, 6, ... } svuzp2: { 1, 3, 5, 7, ... }. */ @@ -2456,7 +2515,7 @@ public: } gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { if (f.type_suffix (1).unsigned_p) return fold_type<poly_uint64> (f); @@ -2472,13 +2531,13 @@ class svwrffr_impl : public function_base { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_WRITE_FFR; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (CODE_FOR_aarch64_wrffr); } @@ -2492,7 +2551,7 @@ public: : binary_permute (base ? UNSPEC_ZIP2 : UNSPEC_ZIP1), m_base (base) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { /* svzip1: { 0, nelts, 1, nelts + 1, 2, nelts + 2, ... } svzip2: as for svzip1, but with nelts / 2 added to each index. */ diff --git a/gcc/config/aarch64/aarch64-sve-builtins-functions.h b/gcc/config/aarch64/aarch64-sve-builtins-functions.h index 9d346b6..b8a86e3 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-functions.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-functions.h @@ -44,7 +44,7 @@ public: : T (t1, t2, t3) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return 0; } @@ -59,7 +59,7 @@ public: : m_vectors_per_tuple (vectors_per_tuple) {} unsigned int - vectors_per_tuple () const OVERRIDE + vectors_per_tuple () const override { return m_vectors_per_tuple; } @@ -78,13 +78,13 @@ public: : multi_vector_function (vectors_per_tuple) {} tree - memory_scalar_type (const function_instance &fi) const OVERRIDE + memory_scalar_type (const function_instance &fi) const override { return fi.scalar_type (0); } machine_mode - memory_vector_mode (const function_instance &fi) const OVERRIDE + memory_vector_mode (const function_instance &fi) const override { machine_mode mode = fi.vector_mode (0); if (m_vectors_per_tuple != 1) @@ -103,19 +103,19 @@ public: : m_memory_type (memory_type) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY; } tree - memory_scalar_type (const function_instance &) const OVERRIDE + memory_scalar_type (const function_instance &) const override { return scalar_types[type_suffixes[m_memory_type].vector_type]; } machine_mode - memory_vector_mode (const function_instance &fi) const OVERRIDE + memory_vector_mode (const function_instance &fi) const override { machine_mode mem_mode = type_suffixes[m_memory_type].vector_mode; machine_mode reg_mode = fi.vector_mode (0); @@ -145,13 +145,13 @@ public: CONSTEXPR truncating_store (scalar_int_mode to_mode) : m_to_mode (to_mode) {} unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { 
return CP_WRITE_MEMORY; } tree - memory_scalar_type (const function_instance &fi) const OVERRIDE + memory_scalar_type (const function_instance &fi) const override { /* In truncating stores, the signedness of the memory element is defined to be the same as the signedness of the vector element. The signedness @@ -163,7 +163,7 @@ public: } machine_mode - memory_vector_mode (const function_instance &fi) const OVERRIDE + memory_vector_mode (const function_instance &fi) const override { poly_uint64 nunits = GET_MODE_NUNITS (fi.vector_mode (0)); return aarch64_sve_data_mode (m_to_mode, nunits).require (); @@ -205,7 +205,7 @@ public: : rtx_code_function_base (code_for_sint, code_for_uint, unspec_for_fp) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.map_to_rtx_codes (m_code_for_sint, m_code_for_uint, m_unspec_for_fp); @@ -225,7 +225,7 @@ public: : rtx_code_function_base (code_for_sint, code_for_uint, unspec_for_fp) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Rotate the inputs into their normal order, but continue to make _m functions merge with what was originally the first vector argument. */ @@ -279,7 +279,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.map_to_unspecs (m_unspec_for_sint, m_unspec_for_uint, m_unspec_for_fp); @@ -301,7 +301,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Rotate the inputs into their normal order, but continue to make _m functions merge with what was originally the first vector argument. */ @@ -329,7 +329,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (CODE (unspec_for (e), e.vector_mode (0))); } @@ -386,7 +386,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { int unspec = unspec_for (e); insn_code icode; @@ -421,7 +421,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { int unspec = unspec_for (e); insn_code icode; @@ -451,7 +451,7 @@ class code_for_mode_function : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (CODE_FOR_MODE (e.vector_mode (N))); } @@ -477,7 +477,7 @@ public: CONSTEXPR fixed_insn_function (insn_code code) : m_code (code) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (m_code); } @@ -519,7 +519,7 @@ public: CONSTEXPR binary_permute (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { insn_code icode = code_for_aarch64_sve (m_unspec, e.vector_mode (0)); return e.use_exact_insn (icode); @@ -547,7 +547,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); int unspec = (!e.type_suffix (0).integer_p ? 
m_unspec_for_fp @@ -576,7 +576,7 @@ public: : m_code (code), m_wide_unspec (wide_unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); machine_mode elem_mode = GET_MODE_INNER (mode); @@ -610,7 +610,7 @@ public: CONSTEXPR unary_count (rtx_code code) : m_code (code) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* The md patterns treat the operand as an integer. */ machine_mode mode = aarch64_sve_int_mode (e.vector_mode (0)); @@ -636,7 +636,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Suffix 0 determines the predicate mode, suffix 1 determines the scalar mode and signedness. */ diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc index f57f926..8e26bd8 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc @@ -454,13 +454,13 @@ long_type_suffix (function_resolver &r, type_suffix_index type) struct nonoverloaded_base : public function_shape { bool - explicit_type_suffix_p (unsigned int) const OVERRIDE + explicit_type_suffix_p (unsigned int) const override { return true; } tree - resolve (function_resolver &) const OVERRIDE + resolve (function_resolver &) const override { gcc_unreachable (); } @@ -472,7 +472,7 @@ template<unsigned int EXPLICIT_MASK> struct overloaded_base : public function_shape { bool - explicit_type_suffix_p (unsigned int i) const OVERRIDE + explicit_type_suffix_p (unsigned int i) const override { return (EXPLICIT_MASK >> i) & 1; } @@ -484,7 +484,7 @@ struct adr_base : public overloaded_base<0> /* The function takes two arguments: a vector base and a vector displacement (either an index or an offset). Resolve based on them both. 
*/ tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; mode_suffix_index mode; @@ -503,7 +503,7 @@ template<type_class_index CLASS = function_resolver::SAME_TYPE_CLASS> struct binary_imm_narrowb_base : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS @@ -515,7 +515,7 @@ struct binary_imm_narrowb_base : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (1, 1); } @@ -528,7 +528,7 @@ template<type_class_index CLASS = function_resolver::SAME_TYPE_CLASS> struct binary_imm_narrowt_base : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS @@ -540,7 +540,7 @@ struct binary_imm_narrowt_base : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -560,14 +560,14 @@ struct binary_imm_narrowt_base : public overloaded_base<0> struct binary_imm_long_base : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); build_all (b, "v0,vh0,su64", group, MODE_n); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type, result_type; @@ -623,7 +623,7 @@ struct inc_dec_base : public overloaded_base<0> } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_range (m_pat_p ? 2 : 1, 1, 16); } @@ -637,7 +637,7 @@ struct load_contiguous_base : public overloaded_base<0> /* Resolve a call based purely on a pointer argument. The other arguments are a governing predicate and (for MODE_vnum) a vnum offset. */ tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { bool vnum_p = r.mode_suffix_id == MODE_vnum; gcc_assert (r.mode_suffix_id == MODE_none || vnum_p); @@ -658,7 +658,7 @@ struct load_contiguous_base : public overloaded_base<0> struct load_gather_sv_base : public overloaded_base<0> { tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; mode_suffix_index mode; @@ -686,7 +686,7 @@ struct load_ext_gather_base : public overloaded_base<1> The function has an explicit type suffix that determines the type of the loaded data. */ tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { /* No resolution is needed for a vector base with no displacement; there's a one-to-one mapping between short and long names. 
*/ @@ -713,7 +713,7 @@ struct load_ext_gather_base : public overloaded_base<1> struct mmla_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); /* svmmla is distributed over several extensions. Allow the common @@ -729,7 +729,7 @@ struct mmla_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -769,7 +769,7 @@ struct prefetch_gather_base : public overloaded_base<0> The prefetch operation is the final argument. This is purely a mode-based resolution; there are no type suffixes. */ tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { bool has_displacement_p = r.displacement_units () != UNITS_none; @@ -791,7 +791,7 @@ template<typename BASE, unsigned int N> struct shift_right_imm_narrow_wrapper : public BASE { bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int bits = c.type_suffix (0).element_bits / 2; return c.require_immediate_range (N, 1, bits); @@ -811,7 +811,7 @@ struct store_scatter_base : public overloaded_base<0> The stored data is the final argument, and it determines the type suffix. */ tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { bool has_displacement_p = r.displacement_units () != UNITS_none; @@ -832,14 +832,14 @@ struct store_scatter_base : public overloaded_base<0> struct ternary_shift_imm_base : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); build_all (b, "v0,v0,v0,su64", group, MODE_n); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (2, 1); } @@ -862,7 +862,7 @@ template<unsigned int MODIFIER, struct ternary_resize2_opt_n_base : public overloaded_base<0> { tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -884,7 +884,7 @@ template<unsigned int MODIFIER, struct ternary_resize2_base : public overloaded_base<0> { tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -908,7 +908,7 @@ template<unsigned int MODIFIER, struct ternary_resize2_lane_base : public overloaded_base<0> { tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -932,14 +932,14 @@ struct ternary_bfloat_lane_base : public ternary_resize2_lane_base<16, TYPE_bfloat, TYPE_bfloat> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vB,vB,su64", group, MODE_none); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_lane_index (3, N); } @@ -954,7 +954,7 @@ struct ternary_qq_lane_base TYPE_CLASS2, TYPE_CLASS3> { bool - 
check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_lane_index (3, 4); } @@ -966,7 +966,7 @@ template<type_class_index CLASS = function_resolver::SAME_TYPE_CLASS> struct unary_narrowb_base : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS @@ -978,7 +978,7 @@ struct unary_narrowb_base : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_unary (CLASS, r.HALF_SIZE); } @@ -991,7 +991,7 @@ template<type_class_index CLASS = function_resolver::SAME_TYPE_CLASS> struct unary_narrowt_base : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS @@ -1003,7 +1003,7 @@ struct unary_narrowt_base : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1023,7 +1023,7 @@ struct unary_narrowt_base : public overloaded_base<0> struct adr_index_def : public adr_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_index); build_all (b, "b,b,d", group, MODE_u32base_s32index); @@ -1041,7 +1041,7 @@ SHAPE (adr_index) struct adr_offset_def : public adr_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_offset); build_all (b, "b,b,d", group, MODE_u32base_s32offset); @@ -1058,14 +1058,14 @@ SHAPE (adr_offset) struct binary_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (2); } @@ -1080,7 +1080,7 @@ SHAPE (binary) struct binary_int_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vs0", group, MODE_none); @@ -1088,7 +1088,7 @@ struct binary_int_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1108,20 +1108,20 @@ SHAPE (binary_int_opt_n) struct binary_lane_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all 
(b, "v0,v0,v0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (2, 1); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_lane_index (2); } @@ -1135,14 +1135,14 @@ SHAPE (binary_lane) struct binary_long_lane_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,vh0,vh0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type, result_type; @@ -1160,7 +1160,7 @@ struct binary_long_lane_def : public overloaded_base<0> } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_lane_index (2); } @@ -1172,7 +1172,7 @@ SHAPE (binary_long_lane) struct binary_long_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,vh0,vh0", group, MODE_none); @@ -1180,7 +1180,7 @@ struct binary_long_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type, result_type; @@ -1202,14 +1202,14 @@ SHAPE (binary_long_opt_n) struct binary_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); build_all (b, "v0,v0,s0", group, MODE_n); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1231,7 +1231,7 @@ SHAPE (binary_n) struct binary_narrowb_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vh0,v0,v0", group, MODE_none); @@ -1239,7 +1239,7 @@ struct binary_narrowb_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform_opt_n (2); } @@ -1253,7 +1253,7 @@ SHAPE (binary_narrowb_opt_n) struct binary_narrowt_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vh0,vh0,v0,v0", group, MODE_none); @@ -1261,7 +1261,7 @@ struct binary_narrowt_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1284,7 +1284,7 @@ SHAPE (binary_narrowt_opt_n) struct binary_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) 
const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0", group, MODE_none); @@ -1298,7 +1298,7 @@ struct binary_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform_opt_n (2); } @@ -1309,7 +1309,7 @@ SHAPE (binary_opt_n) struct binary_pred_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "v0,v0,v0", group, MODE_none); } @@ -1322,20 +1322,20 @@ SHAPE (binary_pred) struct binary_rotate_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (2, 1); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_either_or (2, 90, 270); } @@ -1349,7 +1349,7 @@ SHAPE (binary_rotate) struct binary_scalar_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "v0,s0,s0", group, MODE_none); } @@ -1362,14 +1362,14 @@ SHAPE (binary_scalar) struct binary_to_uint_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vu0,v0,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (2); } @@ -1383,14 +1383,14 @@ SHAPE (binary_to_uint) struct binary_uint_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vu0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1411,14 +1411,14 @@ SHAPE (binary_uint) struct binary_uint_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,su0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1440,7 +1440,7 @@ SHAPE (binary_uint_n) struct binary_uint_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vu0", group, MODE_none); @@ 
-1448,7 +1448,7 @@ struct binary_uint_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1468,14 +1468,14 @@ SHAPE (binary_uint_opt_n) struct binary_uint64_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1497,7 +1497,7 @@ SHAPE (binary_uint64_n) struct binary_uint64_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vu64", group, MODE_none); @@ -1505,7 +1505,7 @@ struct binary_uint64_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1522,14 +1522,14 @@ SHAPE (binary_uint64_opt_n) struct binary_wide_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vh0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1549,7 +1549,7 @@ SHAPE (binary_wide) struct binary_wide_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vh0", group, MODE_none); @@ -1557,7 +1557,7 @@ struct binary_wide_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1576,7 +1576,7 @@ SHAPE (binary_wide_opt_n) struct clast_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0", group, MODE_none); @@ -1584,7 +1584,7 @@ struct clast_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; if (!r.check_gp_argument (2, i, nargs) @@ -1615,14 +1615,14 @@ SHAPE (clast) struct compare_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vp,v0,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (2); } @@ -1636,7 +1636,7 @@ SHAPE (compare) 
struct compare_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vp,v0,v0", group, MODE_none); @@ -1644,7 +1644,7 @@ struct compare_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform_opt_n (2); } @@ -1655,14 +1655,14 @@ SHAPE (compare_opt_n) struct compare_ptr_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vp,al,al", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1684,14 +1684,14 @@ SHAPE (compare_ptr) struct compare_scalar_def : public overloaded_base<1> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vp,s1,s1", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1714,7 +1714,7 @@ SHAPE (compare_scalar) struct compare_wide_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vp,v0,vw0", group, MODE_none); @@ -1722,7 +1722,7 @@ struct compare_wide_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1739,7 +1739,7 @@ SHAPE (compare_wide_opt_n) struct count_inherent_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "su64", group, MODE_none); } @@ -1750,7 +1750,7 @@ SHAPE (count_inherent) struct count_pat_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "su64,epattern", group, MODE_none); } @@ -1761,7 +1761,7 @@ SHAPE (count_pat) struct count_pred_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "su64,vp", group, MODE_none); } @@ -1772,14 +1772,14 @@ SHAPE (count_pred) struct count_vector_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "su64,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform 
(1); } @@ -1792,14 +1792,14 @@ SHAPE (count_vector) struct create_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "t0,v0*t", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (r.vectors_per_tuple ()); } @@ -1813,7 +1813,7 @@ SHAPE (create) struct dupq_def : public overloaded_base<1> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { /* The "_n" suffix is optional; the full name has it, but the short name doesn't. */ @@ -1821,7 +1821,7 @@ struct dupq_def : public overloaded_base<1> } tree - resolve (function_resolver &) const OVERRIDE + resolve (function_resolver &) const override { /* The short forms just make "_n" implicit, so no resolution is needed. */ gcc_unreachable (); @@ -1836,20 +1836,20 @@ SHAPE (dupq) struct ext_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (2, 1); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int bytes = c.type_suffix (0).element_bytes; return c.require_immediate_range (2, 0, 256 / bytes - 1); @@ -1861,14 +1861,14 @@ SHAPE (ext) struct fold_left_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "s0,s0,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1889,14 +1889,14 @@ SHAPE (fold_left) struct get_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,t0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1909,7 +1909,7 @@ struct get_def : public overloaded_base<0> } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int nvectors = c.vectors_per_tuple (); return c.require_immediate_range (1, 0, nvectors - 1); @@ -1927,7 +1927,7 @@ struct inc_dec_def : public inc_dec_base CONSTEXPR inc_dec_def () : inc_dec_base (false) {} void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); /* These functions are unusual in that the type suffixes for @@ -1952,7 +1952,7 @@ struct inc_dec_pat_def : public inc_dec_base CONSTEXPR inc_dec_pat_def () : 
inc_dec_base (true) {} void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); /* These functions are unusual in that the type suffixes for @@ -1971,14 +1971,14 @@ SHAPE (inc_dec_pat) struct inc_dec_pred_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vp", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -1998,14 +1998,14 @@ SHAPE (inc_dec_pred) struct inc_dec_pred_scalar_def : public overloaded_base<2> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); build_all (b, "s0,s0,vp", group, MODE_n); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -2023,7 +2023,7 @@ SHAPE (inc_dec_pred_scalar) struct inherent_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "t0", group, MODE_none); } @@ -2034,7 +2034,7 @@ SHAPE (inherent) struct inherent_b_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { /* The "_b" suffix is optional; the full name has it, but the short name doesn't. */ @@ -2042,7 +2042,7 @@ struct inherent_b_def : public overloaded_base<0> } tree - resolve (function_resolver &) const OVERRIDE + resolve (function_resolver &) const override { /* The short forms just make "_b" implicit, so no resolution is needed. 
*/ gcc_unreachable (); @@ -2055,7 +2055,7 @@ SHAPE (inherent_b) struct load_def : public load_contiguous_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); b.add_overloaded_functions (group, MODE_vnum); @@ -2072,7 +2072,7 @@ SHAPE (load) struct load_ext_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "t0,al", group, MODE_none); build_all (b, "t0,al,ss64", group, MODE_vnum); @@ -2092,7 +2092,7 @@ SHAPE (load_ext) struct load_ext_gather_index_def : public load_ext_gather_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_index); build_sv_index (b, "t0,al,d", group); @@ -2112,7 +2112,7 @@ SHAPE (load_ext_gather_index) struct load_ext_gather_index_restricted_def : public load_ext_gather_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_index); build_sv_index64 (b, "t0,al,d", group); @@ -2136,7 +2136,7 @@ SHAPE (load_ext_gather_index_restricted) struct load_ext_gather_offset_def : public load_ext_gather_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_offset); build_sv_offset (b, "t0,al,d", group); @@ -2161,7 +2161,7 @@ SHAPE (load_ext_gather_offset) struct load_ext_gather_offset_restricted_def : public load_ext_gather_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_offset); build_sv_uint_offset (b, "t0,al,d", group); @@ -2183,7 +2183,7 @@ SHAPE (load_ext_gather_offset_restricted) struct load_gather_sv_def : public load_gather_sv_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_index); b.add_overloaded_functions (group, MODE_offset); @@ -2205,7 +2205,7 @@ SHAPE (load_gather_sv) struct load_gather_sv_restricted_def : public load_gather_sv_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_index); b.add_overloaded_functions (group, MODE_offset); @@ -2226,7 +2226,7 @@ SHAPE (load_gather_sv_restricted) struct load_gather_vs_def : public overloaded_base<1> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { /* The base vector mode is optional; the full name has it but the short name doesn't. 
There is no ambiguity with SHAPE_load_gather_sv @@ -2237,7 +2237,7 @@ struct load_gather_vs_def : public overloaded_base<1> } tree - resolve (function_resolver &) const OVERRIDE + resolve (function_resolver &) const override { /* The short name just makes the base vector mode implicit; no resolution is needed. */ @@ -2252,7 +2252,7 @@ SHAPE (load_gather_vs) struct load_replicate_def : public load_contiguous_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "t0,al", group, MODE_none); @@ -2264,7 +2264,7 @@ SHAPE (load_replicate) struct pattern_pred_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "vp,epattern", group, MODE_none); } @@ -2276,7 +2276,7 @@ SHAPE (pattern_pred) struct prefetch_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "_,ap,eprfop", group, MODE_none); build_all (b, "_,ap,ss64,eprfop", group, MODE_vnum); @@ -2297,7 +2297,7 @@ SHAPE (prefetch) struct prefetch_gather_index_def : public prefetch_gather_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); b.add_overloaded_functions (group, MODE_index); @@ -2321,7 +2321,7 @@ SHAPE (prefetch_gather_index) struct prefetch_gather_offset_def : public prefetch_gather_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); b.add_overloaded_functions (group, MODE_offset); @@ -2336,7 +2336,7 @@ SHAPE (prefetch_gather_offset) struct ptest_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "sp,vp", group, MODE_none); } @@ -2347,7 +2347,7 @@ SHAPE (ptest) struct rdffr_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "vp", group, MODE_none); } @@ -2358,14 +2358,14 @@ SHAPE (rdffr) struct reduction_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "s0,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (1); } @@ -2381,14 +2381,14 @@ SHAPE (reduction) struct reduction_wide_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "sw0,v0", group, MODE_none); } tree - resolve (function_resolver 
&r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (1); } @@ -2402,14 +2402,14 @@ SHAPE (reduction_wide) struct set_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "t0,t0,su64,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -2423,7 +2423,7 @@ struct set_def : public overloaded_base<0> } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int nvectors = c.vectors_per_tuple (); return c.require_immediate_range (1, 0, nvectors - 1); @@ -2435,7 +2435,7 @@ SHAPE (set) struct setffr_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "_", group, MODE_none); } @@ -2449,20 +2449,20 @@ SHAPE (setffr) struct shift_left_imm_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); build_all (b, "v0,v0,su64", group, MODE_n); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (1, 1); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int bits = c.type_suffix (0).element_bits; return c.require_immediate_range (1, 0, bits - 1); @@ -2477,7 +2477,7 @@ SHAPE (shift_left_imm) struct shift_left_imm_long_def : public binary_imm_long_base { bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int bits = c.type_suffix (0).element_bits / 2; return c.require_immediate_range (1, 0, bits - 1); @@ -2492,7 +2492,7 @@ SHAPE (shift_left_imm_long) struct shift_left_imm_to_uint_def : public shift_left_imm_def { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); build_all (b, "vu0,v0,su64", group, MODE_n); @@ -2507,20 +2507,20 @@ SHAPE (shift_left_imm_to_uint) struct shift_right_imm_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_n); build_all (b, "v0,v0,su64", group, MODE_n); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (1, 1); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int bits = c.type_suffix (0).element_bits; return c.require_immediate_range (1, 1, bits); @@ -2572,7 +2572,7 @@ SHAPE (shift_right_imm_narrowt_to_uint) struct store_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); 
b.add_overloaded_functions (group, MODE_vnum); @@ -2581,7 +2581,7 @@ struct store_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { bool vnum_p = r.mode_suffix_id == MODE_vnum; gcc_assert (r.mode_suffix_id == MODE_none || vnum_p); @@ -2612,7 +2612,7 @@ SHAPE (store) struct store_scatter_index_def : public store_scatter_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_index); build_sv_index (b, "_,as,d,t0", group); @@ -2632,7 +2632,7 @@ SHAPE (store_scatter_index) struct store_scatter_index_restricted_def : public store_scatter_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_index); build_sv_index64 (b, "_,as,d,t0", group); @@ -2657,7 +2657,7 @@ SHAPE (store_scatter_index_restricted) struct store_scatter_offset_def : public store_scatter_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); b.add_overloaded_functions (group, MODE_offset); @@ -2683,7 +2683,7 @@ SHAPE (store_scatter_offset) struct store_scatter_offset_restricted_def : public store_scatter_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); b.add_overloaded_functions (group, MODE_offset); @@ -2698,14 +2698,14 @@ SHAPE (store_scatter_offset_restricted) struct tbl_tuple_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,t0,vu0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -2724,7 +2724,7 @@ struct ternary_bfloat_def : public ternary_resize2_base<16, TYPE_bfloat, TYPE_bfloat> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vB,vB", group, MODE_none); @@ -2752,7 +2752,7 @@ struct ternary_bfloat_opt_n_def : public ternary_resize2_opt_n_base<16, TYPE_bfloat, TYPE_bfloat> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vB,vB", group, MODE_none); @@ -2770,7 +2770,7 @@ struct ternary_intq_uintq_lane_def : public ternary_qq_lane_base<TYPE_signed, TYPE_unsigned> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vqs0,vqu0,su64", group, MODE_none); @@ -2786,7 +2786,7 @@ struct ternary_intq_uintq_opt_n_def TYPE_signed, 
TYPE_unsigned> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vqs0,vqu0", group, MODE_none); @@ -2802,20 +2802,20 @@ SHAPE (ternary_intq_uintq_opt_n) struct ternary_lane_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0,v0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (3, 1); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_lane_index (3); } @@ -2830,20 +2830,20 @@ SHAPE (ternary_lane) struct ternary_lane_rotate_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0,v0,su64,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (3, 2); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return (c.require_immediate_lane_index (3, 2) && c.require_immediate_one_of (4, 0, 90, 180, 270)); @@ -2859,14 +2859,14 @@ struct ternary_long_lane_def : public ternary_resize2_lane_base<function_resolver::HALF_SIZE> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vh0,vh0,su64", group, MODE_none); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_lane_index (3); } @@ -2883,7 +2883,7 @@ struct ternary_long_opt_n_def : public ternary_resize2_opt_n_base<function_resolver::HALF_SIZE> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vh0,vh0", group, MODE_none); @@ -2900,7 +2900,7 @@ SHAPE (ternary_long_opt_n) struct ternary_opt_n_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0,v0", group, MODE_none); @@ -2908,7 +2908,7 @@ struct ternary_opt_n_def : public overloaded_base<0> } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform_opt_n (3); } @@ -2922,7 +2922,7 @@ SHAPE (ternary_opt_n) struct ternary_qq_lane_def : public ternary_qq_lane_base<> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vq0,vq0,su64", group, MODE_none); @@ -2938,14 +2938,14 @@ SHAPE 
(ternary_qq_lane) struct ternary_qq_lane_rotate_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vq0,vq0,su64,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -2963,7 +2963,7 @@ struct ternary_qq_lane_rotate_def : public overloaded_base<0> } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return (c.require_immediate_lane_index (3, 4) && c.require_immediate_one_of (4, 0, 90, 180, 270)); @@ -2981,7 +2981,7 @@ struct ternary_qq_opt_n_def : public ternary_resize2_opt_n_base<function_resolver::QUARTER_SIZE> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vq0,vq0", group, MODE_none); @@ -2998,14 +2998,14 @@ SHAPE (ternary_qq_opt_n) struct ternary_qq_rotate_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vq0,vq0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -3022,7 +3022,7 @@ struct ternary_qq_rotate_def : public overloaded_base<0> } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_one_of (3, 0, 90, 180, 270); } @@ -3036,20 +3036,20 @@ SHAPE (ternary_qq_rotate) struct ternary_rotate_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0,v0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (3, 1); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_one_of (3, 0, 90, 180, 270); } @@ -3063,7 +3063,7 @@ SHAPE (ternary_rotate) struct ternary_shift_left_imm_def : public ternary_shift_imm_base { bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int bits = c.type_suffix (0).element_bits; return c.require_immediate_range (2, 0, bits - 1); @@ -3078,7 +3078,7 @@ SHAPE (ternary_shift_left_imm) struct ternary_shift_right_imm_def : public ternary_shift_imm_base { bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { unsigned int bits = c.type_suffix (0).element_bits; return c.require_immediate_range (2, 1, bits); @@ -3090,14 +3090,14 @@ SHAPE (ternary_shift_right_imm) struct ternary_uint_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); 
build_all (b, "v0,v0,v0,vu0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -3119,7 +3119,7 @@ struct ternary_uintq_intq_def TYPE_unsigned, TYPE_signed> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vqu0,vqs0", group, MODE_none); @@ -3136,7 +3136,7 @@ struct ternary_uintq_intq_lane_def : public ternary_qq_lane_base<TYPE_unsigned, TYPE_signed> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vqu0,vqs0,su64", group, MODE_none); @@ -3152,7 +3152,7 @@ struct ternary_uintq_intq_opt_n_def TYPE_unsigned, TYPE_signed> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,vqu0,vqs0", group, MODE_none); @@ -3168,20 +3168,20 @@ SHAPE (ternary_uintq_intq_opt_n) struct tmad_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0,v0,su64", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_uniform (2, 1); } bool - check (function_checker &c) const OVERRIDE + check (function_checker &c) const override { return c.require_immediate_range (2, 0, 7); } @@ -3195,14 +3195,14 @@ SHAPE (tmad) struct unary_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_unary (); } @@ -3216,14 +3216,14 @@ SHAPE (unary) struct unary_convert_def : public overloaded_base<1> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v1", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_unary (r.type_suffix (0).tclass, r.type_suffix (0).element_bits); @@ -3239,14 +3239,14 @@ SHAPE (unary_convert) struct unary_convert_narrowt_def : public overloaded_base<1> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,v1", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_unary (r.type_suffix (0).tclass, r.type_suffix (0).element_bits, true); @@ -3258,14 +3258,14 @@ SHAPE (unary_convert_narrowt) 
struct unary_long_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,vh0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type, result_type; @@ -3286,7 +3286,7 @@ SHAPE (unary_long) struct unary_n_def : public overloaded_base<1> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { /* The "_n" suffix is optional; the full name has it, but the short name doesn't. */ @@ -3294,7 +3294,7 @@ struct unary_n_def : public overloaded_base<1> } tree - resolve (function_resolver &) const OVERRIDE + resolve (function_resolver &) const override { /* The short forms just make "_n" implicit, so no resolution is needed. */ gcc_unreachable (); @@ -3322,7 +3322,7 @@ SHAPE (unary_narrowt_to_uint) struct unary_pred_def : public nonoverloaded_base { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { build_all (b, "v0,v0", group, MODE_none); } @@ -3336,14 +3336,14 @@ SHAPE (unary_pred) struct unary_to_int_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vs0,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_unary (TYPE_signed); } @@ -3357,14 +3357,14 @@ SHAPE (unary_to_int) struct unary_to_uint_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "vu0,v0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { return r.resolve_unary (TYPE_unsigned); } @@ -3378,14 +3378,14 @@ SHAPE (unary_to_uint) struct unary_uint_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,vu0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; @@ -3414,14 +3414,14 @@ SHAPE (unary_uint) struct unary_widen_def : public overloaded_base<0> { void - build (function_builder &b, const function_group_info &group) const OVERRIDE + build (function_builder &b, const function_group_info &group) const override { b.add_overloaded_functions (group, MODE_none); build_all (b, "v0,vh0", group, MODE_none); } tree - resolve (function_resolver &r) const OVERRIDE + resolve (function_resolver &r) const override { unsigned int i, nargs; type_suffix_index type; diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc index e066f09..c010437 100644 --- 
a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc @@ -82,7 +82,7 @@ class svaba_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { rtx_code max_code = e.type_suffix (0).unsigned_p ? UMAX : SMAX; machine_mode mode = e.vector_mode (0); @@ -94,7 +94,7 @@ class svcdot_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert the rotation amount into a specific unspec. */ int rot = INTVAL (e.args.pop ()); @@ -107,7 +107,7 @@ class svcdot_lane_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert the rotation amount into a specific unspec. */ int rot = INTVAL (e.args.pop ()); @@ -120,13 +120,13 @@ class svldnt1_gather_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_READ_MEMORY; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_gather_address_operands (1, false); machine_mode mem_mode = e.memory_vector_mode (); @@ -142,7 +142,7 @@ public: : extending_load (memory_type) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_gather_address_operands (1, false); /* Add a constant predicate for the extension rtx. */ @@ -162,7 +162,7 @@ public: CONSTEXPR svmatch_svnmatch_impl (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* These are UNSPEC_PRED_Z operations and so need a hint operand. */ e.add_ptrue_hint (0, e.gp_mode (0)); @@ -185,7 +185,7 @@ public: {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.args.quick_push (const0_rtx); return e.map_to_unspecs (m_unspec_for_sint, m_unspec_for_uint, @@ -197,7 +197,7 @@ class svqcadd_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert the rotation amount into a specific unspec. */ int rot = INTVAL (e.args.pop ()); @@ -213,7 +213,7 @@ class svqrdcmlah_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert the rotation amount into a specific unspec. */ int rot = INTVAL (e.args.pop ()); @@ -226,7 +226,7 @@ class svqrdcmlah_lane_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { /* Convert the rotation amount into a specific unspec. 
*/ int rot = INTVAL (e.args.pop ()); @@ -242,7 +242,7 @@ public: : unspec_based_function (UNSPEC_SQRSHL, UNSPEC_UQRSHL, -1) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { if (tree amount = uniform_integer_cst_p (gimple_call_arg (f.call, 2))) { @@ -276,7 +276,7 @@ public: : unspec_based_function (UNSPEC_SQSHL, UNSPEC_UQSHL, -1) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { if (tree amount = uniform_integer_cst_p (gimple_call_arg (f.call, 2))) { @@ -312,7 +312,7 @@ public: : unspec_based_function (UNSPEC_SRSHL, UNSPEC_URSHL, -1) {} gimple * - fold (gimple_folder &f) const OVERRIDE + fold (gimple_folder &f) const override { if (tree amount = uniform_integer_cst_p (gimple_call_arg (f.call, 2))) { @@ -349,7 +349,7 @@ class svsqadd_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); if (e.pred == PRED_x @@ -363,7 +363,7 @@ class svsra_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { rtx_code shift_code = e.type_suffix (0).unsigned_p ? LSHIFTRT : ASHIFTRT; machine_mode mode = e.vector_mode (0); @@ -375,13 +375,13 @@ class svstnt1_scatter_impl : public full_width_access { public: unsigned int - call_properties (const function_instance &) const OVERRIDE + call_properties (const function_instance &) const override { return CP_WRITE_MEMORY; } rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_gather_address_operands (1, false); machine_mode mem_mode = e.memory_vector_mode (); @@ -397,7 +397,7 @@ public: : truncating_store (to_mode) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { e.prepare_gather_address_operands (1, false); insn_code icode = code_for_aarch64_scatter_stnt (e.vector_mode (0), @@ -412,7 +412,7 @@ public: CONSTEXPR svtbl2_impl () : quiet<multi_vector_function> (2) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { return e.use_exact_insn (code_for_aarch64_sve2_tbl2 (e.vector_mode (0))); } @@ -422,7 +422,7 @@ class svuqadd_impl : public function_base { public: rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { machine_mode mode = e.vector_mode (0); if (e.pred == PRED_x @@ -440,7 +440,7 @@ public: CONSTEXPR svwhilerw_svwhilewr_impl (int unspec) : m_unspec (unspec) {} rtx - expand (function_expander &e) const OVERRIDE + expand (function_expander &e) const override { for (unsigned int i = 0; i < 2; ++i) e.args[i] = e.convert_to_pmode (e.args[i]); diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc index 3381726..9d78b27 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -871,20 +871,14 @@ registered_function_hasher::equal (value_type value, const compare_type &key) } sve_switcher::sve_switcher () - : m_old_isa_flags (aarch64_isa_flags) + : aarch64_simd_switcher (AARCH64_FL_F16 | AARCH64_FL_SVE) { /* Changing the ISA flags and have_regs_of_mode should be enough here. We shouldn't need to pay the compile-time cost of a full target switch. 
*/ - aarch64_isa_flags = (AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_F16 - | AARCH64_FL_SVE); - m_old_maximum_field_alignment = maximum_field_alignment; maximum_field_alignment = 0; - m_old_general_regs_only = TARGET_GENERAL_REGS_ONLY; - global_options.x_target_flags &= ~MASK_GENERAL_REGS_ONLY; - memcpy (m_old_have_regs_of_mode, have_regs_of_mode, sizeof (have_regs_of_mode)); for (int i = 0; i < NUM_MACHINE_MODES; ++i) @@ -896,9 +890,6 @@ sve_switcher::~sve_switcher () { memcpy (have_regs_of_mode, m_old_have_regs_of_mode, sizeof (have_regs_of_mode)); - if (m_old_general_regs_only) - global_options.x_target_flags |= MASK_GENERAL_REGS_ONLY; - aarch64_isa_flags = m_old_isa_flags; maximum_field_alignment = m_old_maximum_field_alignment; } @@ -1352,13 +1343,17 @@ function_resolver::infer_vector_or_tuple_type (unsigned int argno, " expects a single SVE vector rather than a tuple", actual, argno + 1, fndecl); else if (size_i == 0 && type_i != VECTOR_TYPE_svbool_t) - error_at (location, "passing single vector %qT to argument %d" - " of %qE, which expects a tuple of %d vectors", - actual, argno + 1, fndecl, num_vectors); + /* num_vectors is always != 1, so the singular isn't needed. */ + error_n (location, num_vectors, "%qT%d%qE%d", + "passing single vector %qT to argument %d" + " of %qE, which expects a tuple of %d vectors", + actual, argno + 1, fndecl, num_vectors); else - error_at (location, "passing %qT to argument %d of %qE, which" - " expects a tuple of %d vectors", actual, argno + 1, - fndecl, num_vectors); + /* num_vectors is always != 1, so the singular isn't needed. */ + error_n (location, num_vectors, "%qT%d%qE%d", + "passing %qT to argument %d of %qE, which" + " expects a tuple of %d vectors", actual, argno + 1, + fndecl, num_vectors); return NUM_TYPE_SUFFIXES; } } diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h index 48cae9a..24594d5 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.h +++ b/gcc/config/aarch64/aarch64-sve-builtins.h @@ -651,16 +651,14 @@ public: /* RAII class for enabling enough SVE features to define the built-in types and implement the arm_sve.h pragma. 
*/ -class sve_switcher +class sve_switcher : public aarch64_simd_switcher { public: sve_switcher (); ~sve_switcher (); private: - unsigned long m_old_isa_flags; unsigned int m_old_maximum_field_alignment; - bool m_old_general_regs_only; bool m_old_have_regs_of_mode[MAX_MACHINE_MODE]; }; diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index 3eed700..27da961 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2,neoversen2,demeter" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index dbeaaf4..d049f9a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -306,9 +306,6 @@ static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, aarch64_addr_query_type); static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val); -/* Major revision number of the ARM Architecture implemented by the target. */ -unsigned aarch64_architecture_version; - /* The processor for which instructions should be scheduled. 
*/ enum aarch64_processor aarch64_tune = cortexa53; @@ -519,6 +516,42 @@ static const struct cpu_addrcost_table neoversev1_addrcost_table = 0 /* imm_offset */ }; +static const struct cpu_addrcost_table neoversen2_addrcost_table = +{ + { + 1, /* hi */ + 0, /* si */ + 0, /* di */ + 1, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 2, /* post_modify_ld3_st3 */ + 2, /* post_modify_ld4_st4 */ + 0, /* register_offset */ + 0, /* register_sextend */ + 0, /* register_zextend */ + 0 /* imm_offset */ +}; + +static const struct cpu_addrcost_table demeter_addrcost_table = +{ + { + 1, /* hi */ + 0, /* si */ + 0, /* di */ + 1, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 2, /* post_modify_ld3_st3 */ + 2, /* post_modify_ld4_st4 */ + 0, /* register_offset */ + 0, /* register_sextend */ + 0, /* register_zextend */ + 0 /* imm_offset */ +}; + static const struct cpu_regmove_cost generic_regmove_cost = { 1, /* GP2GP */ @@ -624,6 +657,36 @@ static const struct cpu_regmove_cost a64fx_regmove_cost = 2 /* FP2FP */ }; +static const struct cpu_regmove_cost neoversen2_regmove_cost = +{ + 1, /* GP2GP */ + /* Spilling to int<->fp instead of memory is recommended so set + realistic costs compared to memmov_cost. */ + 3, /* GP2FP */ + 2, /* FP2GP */ + 2 /* FP2FP */ +}; + +static const struct cpu_regmove_cost neoversev1_regmove_cost = +{ + 1, /* GP2GP */ + /* Spilling to int<->fp instead of memory is recommended so set + realistic costs compared to memmov_cost. */ + 3, /* GP2FP */ + 2, /* FP2GP */ + 2 /* FP2FP */ +}; + +static const struct cpu_regmove_cost demeter_regmove_cost = +{ + 1, /* GP2GP */ + /* Spilling to int<->fp instead of memory is recommended so set + realistic costs compared to memmov_cost. */ + 3, /* GP2FP */ + 2, /* FP2GP */ + 2 /* FP2FP */ +}; + /* Generic costs for Advanced SIMD vector operations. */ static const advsimd_vec_cost generic_advsimd_vector_cost = { @@ -1269,7 +1332,13 @@ static const struct tune_params generic_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 2, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ "16:12", /* function_align. */ @@ -1298,7 +1367,13 @@ static const struct tune_params cortexa35_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 1, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ @@ -1325,7 +1400,13 @@ static const struct tune_params cortexa53_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 2, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ @@ -1352,7 +1433,13 @@ static const struct tune_params cortexa57_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. 
*/ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ @@ -1379,7 +1466,13 @@ static const struct tune_params cortexa72_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ @@ -1406,7 +1499,13 @@ static const struct tune_params cortexa73_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost. */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 2, /* issue_rate. */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ @@ -1435,7 +1534,13 @@ static const struct tune_params exynosm1_tunings = &generic_branch_cost, &exynosm1_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ "4", /* function_align. */ @@ -1461,7 +1566,13 @@ static const struct tune_params thunderxt88_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 6, /* memmov_cost */ + { 6, /* load_int. */ + 6, /* store_int. */ + 6, /* load_fp. */ + 6, /* store_fp. */ + 6, /* load_pred. */ + 6 /* store_pred. */ + }, /* memmov_cost. */ 2, /* issue_rate */ AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */ "8", /* function_align. */ @@ -1487,7 +1598,13 @@ static const struct tune_params thunderx_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 6, /* memmov_cost */ + { 6, /* load_int. */ + 6, /* store_int. */ + 6, /* load_fp. */ + 6, /* store_fp. */ + 6, /* load_pred. */ + 6 /* store_pred. */ + }, /* memmov_cost. */ 2, /* issue_rate */ AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */ "8", /* function_align. */ @@ -1514,7 +1631,13 @@ static const struct tune_params tsv110_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 4, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ @@ -1541,7 +1664,13 @@ static const struct tune_params xgene1_tunings = &generic_branch_cost, &xgene1_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 6, /* memmov_cost */ + { 6, /* load_int. */ + 6, /* store_int. */ + 6, /* load_fp. */ + 6, /* store_fp. */ + 6, /* load_pred. */ + 6 /* store_pred. */ + }, /* memmov_cost. */ 4, /* issue_rate */ AARCH64_FUSE_NOTHING, /* fusible_ops */ "16", /* function_align. */ @@ -1567,7 +1696,13 @@ static const struct tune_params emag_tunings = &generic_branch_cost, &xgene1_approx_modes, SVE_NOT_IMPLEMENTED, - 6, /* memmov_cost */ + { 6, /* load_int. 
*/ + 6, /* store_int. */ + 6, /* load_fp. */ + 6, /* store_fp. */ + 6, /* load_pred. */ + 6 /* store_pred. */ + }, /* memmov_cost. */ 4, /* issue_rate */ AARCH64_FUSE_NOTHING, /* fusible_ops */ "16", /* function_align. */ @@ -1593,7 +1728,13 @@ static const struct tune_params qdf24xx_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 4, /* issue_rate */ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ @@ -1622,7 +1763,13 @@ static const struct tune_params saphira_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 4, /* issue_rate */ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ @@ -1649,7 +1796,13 @@ static const struct tune_params thunderx2t99_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost. */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 4, /* issue_rate. */ (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ @@ -1676,7 +1829,13 @@ static const struct tune_params thunderx3t110_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost. */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 6, /* issue_rate. */ (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ @@ -1703,7 +1862,13 @@ static const struct tune_params neoversen1_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 2, /* store_int. */ + 5, /* load_fp. */ + 2, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ "32:16", /* function_align. */ @@ -1729,7 +1894,13 @@ static const struct tune_params ampere1_tunings = &generic_branch_cost, &generic_approx_modes, SVE_NOT_IMPLEMENTED, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 4, /* issue_rate */ (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | @@ -1899,12 +2070,18 @@ static const struct tune_params neoversev1_tunings = { &cortexa76_extra_costs, &neoversev1_addrcost_table, - &generic_regmove_cost, + &neoversev1_regmove_cost, &neoversev1_vector_cost, &generic_branch_cost, &generic_approx_modes, SVE_256, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 2, /* store_int. */ + 6, /* load_fp. */ + 2, /* store_fp. */ + 6, /* load_pred. */ + 1 /* store_pred. */ + }, /* memmov_cost. */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ "32:16", /* function_align. 
*/ @@ -2030,12 +2207,18 @@ static const struct tune_params neoverse512tvb_tunings = { &cortexa76_extra_costs, &neoversev1_addrcost_table, - &generic_regmove_cost, + &neoversev1_regmove_cost, &neoverse512tvb_vector_cost, &generic_branch_cost, &generic_approx_modes, SVE_128 | SVE_256, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 2, /* store_int. */ + 6, /* load_fp. */ + 2, /* store_fp. */ + 6, /* load_pred. */ + 1 /* store_pred. */ + }, /* memmov_cost. */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ "32:16", /* function_align. */ @@ -2054,16 +2237,176 @@ static const struct tune_params neoverse512tvb_tunings = &generic_prefetch_tune }; +static const advsimd_vec_cost neoversen2_advsimd_vector_cost = +{ + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 2, /* ld2_st2_permute_cost */ + 2, /* ld3_st3_permute_cost */ + 3, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + 4, /* reduc_i8_cost */ + 4, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 6, /* reduc_f16_cost */ + 4, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* This depends very much on what the scalar value is and + where it comes from. E.g. some constants take two dependent + instructions or a load, while others might be moved from a GPR. + 4 seems to be a reasonable compromise in practice. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ +}; + +static const sve_vec_cost neoversen2_sve_vector_cost = +{ + { + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 3, /* ld2_st2_permute_cost */ + 4, /* ld3_st3_permute_cost */ + 4, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + /* Theoretically, a reduction involving 15 scalar ADDs could + complete in ~5 cycles and would have a cost of 15. [SU]ADDV + completes in 11 cycles, so give it a cost of 15 + 6. */ + 21, /* reduc_i8_cost */ + /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */ + 13, /* reduc_i16_cost */ + /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */ + 9, /* reduc_i32_cost */ + /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */ + 2, /* reduc_i64_cost */ + /* Theoretically, a reduction involving 7 scalar FADDs could + complete in ~8 cycles and would have a cost of 14. FADDV + completes in 6 cycles, so give it a cost of 14 - 2. */ + 12, /* reduc_f16_cost */ + /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */ + 6, /* reduc_f32_cost */ + /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */ + 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* See the comment above the Advanced SIMD versions. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 3, /* clast_cost */ + 10, /* fadda_f16_cost */ + 6, /* fadda_f32_cost */ + 4, /* fadda_f64_cost */ + /* A strided Advanced SIMD x64 load would take two parallel FP loads + (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather + is 1 cycle more. 
The Advanced SIMD version is costed as 2 scalar loads + (cost 8) and a vec_construct (cost 2). Add a full vector operation + (cost 2) to that, to avoid the difference being lost in rounding. + + There is no easy comparison between a strided Advanced SIMD x32 load + and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector + operation more than a 64-bit gather. */ + 14, /* gather_load_x32_cost */ + 12, /* gather_load_x64_cost */ + 3 /* scatter_store_elt_cost */ +}; + +static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info = +{ + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info = +{ + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ +}; + +static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info = +{ + { + { + 3, /* loads_per_cycle */ + 2, /* stores_per_cycle */ + 2, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 3, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 2, /* pred_ops_per_cycle */ + 2, /* while_pred_ops */ + 2, /* int_cmp_pred_ops */ + 1, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static const aarch64_vec_issue_info neoversen2_vec_issue_info = +{ + &neoversen2_scalar_issue_info, + &neoversen2_advsimd_issue_info, + &neoversen2_sve_issue_info +}; + +/* Neoverse N2 costs for vector insn classes. */ +static const struct cpu_vector_cost neoversen2_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 2, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &neoversen2_advsimd_vector_cost, /* advsimd */ + &neoversen2_sve_vector_cost, /* sve */ + &neoversen2_vec_issue_info /* issue_info */ +}; + static const struct tune_params neoversen2_tunings = { &cortexa76_extra_costs, - &generic_addrcost_table, - &generic_regmove_cost, - &cortexa57_vector_cost, + &neoversen2_addrcost_table, + &neoversen2_regmove_cost, + &neoversen2_vector_cost, &generic_branch_cost, &generic_approx_modes, SVE_128, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 1, /* store_int. */ + 6, /* load_fp. */ + 2, /* store_fp. */ + 6, /* load_pred. */ + 1 /* store_pred. */ + }, /* memmov_cost. */ 3, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ "32:16", /* function_align. */ @@ -2076,7 +2419,199 @@ static const struct tune_params neoversen2_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND + | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS + | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. 
*/ + &generic_prefetch_tune +}; + +static const advsimd_vec_cost demeter_advsimd_vector_cost = +{ + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 2, /* ld2_st2_permute_cost */ + 2, /* ld3_st3_permute_cost */ + 3, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + 4, /* reduc_i8_cost */ + 4, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 6, /* reduc_f16_cost */ + 3, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* This depends very much on what the scalar value is and + where it comes from. E.g. some constants take two dependent + instructions or a load, while others might be moved from a GPR. + 4 seems to be a reasonable compromise in practice. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ +}; + +static const sve_vec_cost demeter_sve_vector_cost = +{ + { + 2, /* int_stmt_cost */ + 2, /* fp_stmt_cost */ + 3, /* ld2_st2_permute_cost */ + 3, /* ld3_st3_permute_cost */ + 4, /* ld4_st4_permute_cost */ + 3, /* permute_cost */ + /* Theoretically, a reduction involving 15 scalar ADDs could + complete in ~3 cycles and would have a cost of 15. [SU]ADDV + completes in 11 cycles, so give it a cost of 15 + 8. */ + 21, /* reduc_i8_cost */ + /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */ + 14, /* reduc_i16_cost */ + /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */ + 7, /* reduc_i32_cost */ + /* Likewise for 1 scalar ADD (~1 cycles) vs. 2: 1 + 1. */ + 2, /* reduc_i64_cost */ + /* Theoretically, a reduction involving 7 scalar FADDs could + complete in ~6 cycles and would have a cost of 14. FADDV + completes in 8 cycles, so give it a cost of 14 + 2. */ + 16, /* reduc_f16_cost */ + /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */ + 8, /* reduc_f32_cost */ + /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */ + 4, /* reduc_f64_cost */ + 2, /* store_elt_extra_cost */ + /* This value is just inherited from the Cortex-A57 table. */ + 8, /* vec_to_scalar_cost */ + /* See the comment above the Advanced SIMD versions. */ + 4, /* scalar_to_vec_cost */ + 4, /* align_load_cost */ + 4, /* unalign_load_cost */ + /* Although stores have a latency of 2 and compete for the + vector pipes, in practice it's better not to model that. */ + 1, /* unalign_store_cost */ + 1 /* store_cost */ + }, + 3, /* clast_cost */ + 10, /* fadda_f16_cost */ + 6, /* fadda_f32_cost */ + 4, /* fadda_f64_cost */ + /* A strided Advanced SIMD x64 load would take two parallel FP loads + (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather + is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads + (cost 8) and a vec_construct (cost 2). Add a full vector operation + (cost 2) to that, to avoid the difference being lost in rounding. + + There is no easy comparison between a strided Advanced SIMD x32 load + and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector + operation more than a 64-bit gather. 
*/ + 14, /* gather_load_x32_cost */ + 12, /* gather_load_x64_cost */ + 3 /* scatter_store_elt_cost */ +}; + +static const aarch64_scalar_vec_issue_info demeter_scalar_issue_info = +{ + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 6, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ +}; + +static const aarch64_advsimd_vec_issue_info demeter_advsimd_issue_info = +{ + { + 3, /* loads_stores_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 2, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ +}; + +static const aarch64_sve_vec_issue_info demeter_sve_issue_info = +{ + { + { + 3, /* loads_per_cycle */ + 2, /* stores_per_cycle */ + 4, /* general_ops_per_cycle */ + 0, /* fp_simd_load_general_ops */ + 1 /* fp_simd_store_general_ops */ + }, + 2, /* ld2_st2_general_ops */ + 3, /* ld3_st3_general_ops */ + 3 /* ld4_st4_general_ops */ + }, + 2, /* pred_ops_per_cycle */ + 2, /* while_pred_ops */ + 2, /* int_cmp_pred_ops */ + 1, /* fp_cmp_pred_ops */ + 1, /* gather_scatter_pair_general_ops */ + 1 /* gather_scatter_pair_pred_ops */ +}; + +static const aarch64_vec_issue_info demeter_vec_issue_info = +{ + &demeter_scalar_issue_info, + &demeter_advsimd_issue_info, + &demeter_sve_issue_info +}; + +/* Demeter costs for vector insn classes. */ +static const struct cpu_vector_cost demeter_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 2, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 1, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &demeter_advsimd_vector_cost, /* advsimd */ + &demeter_sve_vector_cost, /* sve */ + &demeter_vec_issue_info /* issue_info */ +}; + +static const struct tune_params demeter_tunings = +{ + &cortexa76_extra_costs, + &demeter_addrcost_table, + &demeter_regmove_cost, + &demeter_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_128, /* sve_width */ + { 4, /* load_int. */ + 2, /* store_int. */ + 6, /* load_fp. */ + 1, /* store_fp. */ + 6, /* load_pred. */ + 2 /* store_pred. */ + }, /* memmov_cost. */ + 5, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 3, /* int_reassoc_width. */ + 6, /* fp_reassoc_width. */ + 3, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND + | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS + | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ &generic_prefetch_tune }; @@ -2089,7 +2624,13 @@ static const struct tune_params a64fx_tunings = &generic_branch_cost, &generic_approx_modes, SVE_512, /* sve_width */ - 4, /* memmov_cost */ + { 4, /* load_int. */ + 4, /* store_int. */ + 4, /* load_fp. */ + 4, /* store_fp. */ + 4, /* load_pred. */ + 4 /* store_pred. */ + }, /* memmov_cost. */ 7, /* issue_rate */ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */ "32", /* function_align. 
*/ @@ -2133,7 +2674,6 @@ struct processor enum aarch64_processor ident; enum aarch64_processor sched_core; enum aarch64_arch arch; - unsigned architecture_version; const uint64_t flags; const struct tune_params *const tune; }; @@ -2142,9 +2682,9 @@ struct processor static const struct processor all_architectures[] = { #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \ - {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL}, + {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, FLAGS, NULL}, #include "aarch64-arches.def" - {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL} + {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL} }; /* Processor cores implementing AArch64. */ @@ -2152,23 +2692,13 @@ static const struct processor all_cores[] = { #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \ {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \ - all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \ FLAGS, &COSTS##_tunings}, #include "aarch64-cores.def" - {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8, + {"generic", generic, cortexa53, AARCH64_ARCH_8A, AARCH64_FL_FOR_ARCH8, &generic_tunings}, - {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL} + {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL} }; - -/* Target specification. These are populated by the -march, -mtune, -mcpu - handling code or by target attributes. */ -static const struct processor *selected_arch; -static const struct processor *selected_cpu; -static const struct processor *selected_tune; - -enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A; - /* The current tuning set. */ struct tune_params aarch64_tune_params = generic_tunings; @@ -2216,8 +2746,6 @@ static const struct attribute_spec aarch64_attribute_table[] = { NULL, 0, 0, false, false, false, false, NULL, NULL } }; -#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0) - /* An ISA extension in the co-processor and main instruction set space. */ struct aarch64_option_extension { @@ -3971,7 +4499,7 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, if (can_create_pseudo_p ()) tmp_reg = gen_reg_rtx (mode); - emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm)); + emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm))); emit_insn (gen_add_losym (dest, tmp_reg, imm)); return; } @@ -4284,7 +4812,7 @@ aarch64_split_128bit_move (rtx dst, rtx src) machine_mode mode = GET_MODE (dst); - gcc_assert (mode == TImode || mode == TFmode); + gcc_assert (mode == TImode || mode == TFmode || mode == TDmode); gcc_assert (!(side_effects_p (src) || side_effects_p (dst))); gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode); @@ -9811,8 +10339,8 @@ aarch64_case_values_threshold (void) /* Use the specified limit for the number of cases before using jump tables at higher optimization levels. */ if (optimize > 2 - && selected_cpu->tune->max_case_values != 0) - return selected_cpu->tune->max_case_values; + && aarch64_tune_params.max_case_values != 0) + return aarch64_tune_params.max_case_values; else return optimize_size ? 
8 : 11; } @@ -10024,6 +10552,7 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode mode) { return mode == SImode || mode == DImode || mode == SFmode || mode == DFmode + || mode == SDmode || mode == DDmode || (aarch64_vector_mode_supported_p (mode) && (known_eq (GET_MODE_SIZE (mode), 8) || (known_eq (GET_MODE_SIZE (mode), 16) @@ -10066,12 +10595,13 @@ aarch64_classify_address (struct aarch64_address_info *info, vec_flags &= ~VEC_PARTIAL; /* On BE, we use load/store pair for all large int mode load/stores. - TI/TFmode may also use a load/store pair. */ + TI/TF/TDmode may also use a load/store pair. */ bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT)); bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP || type == ADDR_QUERY_LDP_STP_N || mode == TImode || mode == TFmode + || mode == TDmode || (BYTES_BIG_ENDIAN && advsimd_struct_p)); /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode corresponds to the actual size of the memory being loaded/stored and the @@ -10145,7 +10675,7 @@ aarch64_classify_address (struct aarch64_address_info *info, info->offset = op1; info->const_offset = offset; - /* TImode and TFmode values are allowed in both pairs of X + /* TImode, TFmode and TDmode values are allowed in both pairs of X registers and individual Q registers. The available address modes are: X,X: 7-bit signed scaled offset @@ -10154,7 +10684,7 @@ aarch64_classify_address (struct aarch64_address_info *info, When performing the check for pairs of X registers i.e. LDP/STP pass down DImode since that is the natural size of the LDP/STP instruction memory accesses. */ - if (mode == TImode || mode == TFmode) + if (mode == TImode || mode == TFmode || mode == TDmode) return (aarch64_offset_7bit_signed_scaled_p (DImode, offset) && (aarch64_offset_9bit_signed_unscaled_p (mode, offset) || offset_12bit_unsigned_scaled_p (mode, offset))); @@ -10277,14 +10807,14 @@ aarch64_classify_address (struct aarch64_address_info *info, info->offset = XEXP (XEXP (x, 1), 1); info->const_offset = offset; - /* TImode and TFmode values are allowed in both pairs of X + /* TImode, TFmode and TDmode values are allowed in both pairs of X registers and individual Q registers. The available address modes are: X,X: 7-bit signed scaled offset Q: 9-bit signed offset We conservatively require an offset representable in either mode. */ - if (mode == TImode || mode == TFmode) + if (mode == TImode || mode == TFmode || mode == TDmode) return (aarch64_offset_7bit_signed_scaled_p (mode, offset) && aarch64_offset_9bit_signed_unscaled_p (mode, offset)); @@ -10446,9 +10976,9 @@ aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2, offset. Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise to increase opportunities for sharing the base address of different sizes. Unaligned accesses use the signed - 9-bit range, TImode/TFmode use the intersection of signed + 9-bit range, TImode/TFmode/TDmode use the intersection of signed scaled 7-bit and signed 9-bit offset. */ - if (mode == TImode || mode == TFmode) + if (mode == TImode || mode == TFmode || mode == TDmode) second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100; else if ((const_offset & (size - 1)) != 0) second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100; @@ -10529,7 +11059,7 @@ aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval) CONST_DOUBLE_REAL_VALUE (value), REAL_MODE_FORMAT (mode)); - if (mode == DFmode) + if (mode == DFmode || mode == DDmode) { int order = BYTES_BIG_ENDIAN ? 
1 : 0; ival = zext_hwi (res[order], 32); @@ -10570,11 +11100,15 @@ aarch64_float_const_rtx_p (rtx x) return false; } -/* Return TRUE if rtx X is immediate constant 0.0 */ +/* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal + Floating Point). */ bool aarch64_float_const_zero_rtx_p (rtx x) { - if (GET_MODE (x) == VOIDmode) + /* 0.0 in Decimal Floating Point cannot be represented by #0 or + zr as our callers expect, so no need to check the actual + value if X is of Decimal Floating Point type. */ + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT) return false; if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x))) @@ -10612,7 +11146,7 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode) else return false; - /* use a 64 bit mode for everything except for DI/DF mode, where we use + /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use a 128 bit vector mode. */ int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64; @@ -11812,7 +12346,7 @@ aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size, if (IN_RANGE (offset, -256, 0)) return 0; - if (mode == TImode || mode == TFmode) + if (mode == TImode || mode == TFmode || mode == TDmode) return (offset + 0x100) & ~0x1ff; /* Use 12-bit offset by access size. */ @@ -11921,7 +12455,9 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, /* Without the TARGET_SIMD instructions we cannot move a Q register to a Q register directly. We need a scratch. */ - if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x) + if (REG_P (x) + && (mode == TFmode || mode == TImode || mode == TDmode) + && mode == GET_MODE (x) && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD && reg_class_subset_p (rclass, FP_REGS)) { @@ -11929,14 +12465,16 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, return NO_REGS; } - /* A TFmode or TImode memory access should be handled via an FP_REGS + /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS because AArch64 has richer addressing modes for LDR/STR instructions than LDP/STP instructions. */ if (TARGET_FLOAT && rclass == GENERAL_REGS && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x)) return FP_REGS; - if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x)) + if (rclass == FP_REGS + && (mode == TImode || mode == TFmode || mode == TDmode) + && CONSTANT_P(x)) return GENERAL_REGS; return NO_REGS; @@ -13067,9 +13605,9 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, *cost += extra_cost->ldst.storev; else if (GET_MODE_CLASS (mode) == MODE_INT) *cost += extra_cost->ldst.store; - else if (mode == SFmode) + else if (mode == SFmode || mode == SDmode) *cost += extra_cost->ldst.storef; - else if (mode == DFmode) + else if (mode == DFmode || mode == DDmode) *cost += extra_cost->ldst.stored; *cost += @@ -13193,11 +13731,11 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, /* mov[df,sf]_aarch64. */ if (aarch64_float_const_representable_p (x)) /* FMOV (scalar immediate). */ - *cost += extra_cost->fp[mode == DFmode].fpconst; + *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst; else if (!aarch64_float_const_zero_rtx_p (x)) { /* This will be a load from memory. 
*/ - if (mode == DFmode) + if (mode == DFmode || mode == DDmode) *cost += extra_cost->ldst.loadd; else *cost += extra_cost->ldst.loadf; @@ -13223,9 +13761,9 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, *cost += extra_cost->ldst.loadv; else if (GET_MODE_CLASS (mode) == MODE_INT) *cost += extra_cost->ldst.load; - else if (mode == SFmode) + else if (mode == SFmode || mode == SDmode) *cost += extra_cost->ldst.loadf; - else if (mode == DFmode) + else if (mode == DFmode || mode == DDmode) *cost += extra_cost->ldst.loadd; *cost += @@ -14474,12 +15012,28 @@ aarch64_register_move_cost (machine_mode mode, return regmove_cost->FP2FP; } +/* Implements TARGET_MEMORY_MOVE_COST. */ static int -aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, - reg_class_t rclass ATTRIBUTE_UNUSED, - bool in ATTRIBUTE_UNUSED) +aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in) { - return aarch64_tune_params.memmov_cost; + enum reg_class rclass = (enum reg_class) rclass_i; + if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL + ? reg_classes_intersect_p (rclass, PR_REGS) + : reg_class_subset_p (rclass, PR_REGS)) + return (in + ? aarch64_tune_params.memmov_cost.load_pred + : aarch64_tune_params.memmov_cost.store_pred); + + if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode) + ? reg_classes_intersect_p (rclass, FP_REGS) + : reg_class_subset_p (rclass, FP_REGS)) + return (in + ? aarch64_tune_params.memmov_cost.load_fp + : aarch64_tune_params.memmov_cost.store_fp); + + return (in + ? aarch64_tune_params.memmov_cost.load_int + : aarch64_tune_params.memmov_cost.store_int); } /* Implement TARGET_INIT_BUILTINS. */ @@ -14970,7 +15524,9 @@ aarch64_vec_op_count::sve_issue_info () const fractional_cost aarch64_vec_op_count::rename_cycles_per_iter () const { - if (sve_issue_info () == &neoverse512tvb_sve_issue_info) + if (sve_issue_info () == &neoverse512tvb_sve_issue_info + || sve_issue_info () == &neoversen2_sve_issue_info + || sve_issue_info () == &demeter_sve_issue_info) /* + 1 for an addition. We've already counted a general op for each store, so we don't need to account for stores separately. The branch reads no registers and so does not need to be counted either. @@ -15075,11 +15631,16 @@ private: unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *, unsigned int); bool prefer_unrolled_loop () const; + unsigned int determine_suggested_unroll_factor (); /* True if we have performed one-time initialization based on the vec_info. */ bool m_analyzed_vinfo = false; + /* This loop uses an average operation that is not supported by SVE, but is + supported by Advanced SIMD and SVE2. */ + bool m_has_avg = false; + /* - If M_VEC_FLAGS is zero then we're costing the original scalar code. - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced SIMD code. @@ -16080,6 +16641,21 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, as one iteration of the SVE loop. */ if (where == vect_body && m_unrolled_advsimd_niters) m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters; + + /* Detect the use of an averaging operation. 
*/ + gimple *stmt = stmt_info->stmt; + if (is_gimple_call (stmt) + && gimple_call_internal_p (stmt)) + { + switch (gimple_call_internal_fn (stmt)) + { + case IFN_AVG_FLOOR: + case IFN_AVG_CEIL: + m_has_avg = true; + default: + break; + } + } } return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ()); } @@ -16163,6 +16739,68 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops, return sve_cycles_per_iter; } +unsigned int +aarch64_vector_costs::determine_suggested_unroll_factor () +{ + bool sve = m_vec_flags & VEC_ANY_SVE; + /* If we are trying to unroll an Advanced SIMD main loop that contains + an averaging operation that we do not support with SVE and we might use a + predicated epilogue, we need to be conservative and block unrolling as + this might lead to a less optimal loop for the first and only epilogue + using the original loop's vectorization factor. + TODO: Remove this constraint when we add support for multiple epilogue + vectorization. */ + if (!sve && !TARGET_SVE2 && m_has_avg) + return 1; + + unsigned int max_unroll_factor = 1; + for (auto vec_ops : m_ops) + { + aarch64_simd_vec_issue_info const *vec_issue + = vec_ops.simd_issue_info (); + if (!vec_issue) + return 1; + /* Limit unroll factor to a value adjustable by the user, the default + value is 4. */ + unsigned int unroll_factor = aarch64_vect_unroll_limit; + unsigned int factor + = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1; + unsigned int temp; + + /* Sanity check, this should never happen. */ + if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0) + return 1; + + /* Check stores. */ + if (vec_ops.stores > 0) + { + temp = CEIL (factor * vec_issue->stores_per_cycle, + vec_ops.stores); + unroll_factor = MIN (unroll_factor, temp); + } + + /* Check loads + stores. */ + if (vec_ops.loads > 0) + { + temp = CEIL (factor * vec_issue->loads_stores_per_cycle, + vec_ops.loads + vec_ops.stores); + unroll_factor = MIN (unroll_factor, temp); + } + + /* Check general ops. */ + if (vec_ops.general_ops > 0) + { + temp = CEIL (factor * vec_issue->general_ops_per_cycle, + vec_ops.general_ops); + unroll_factor = MIN (unroll_factor, temp); + } + max_unroll_factor = MAX (max_unroll_factor, unroll_factor); + } + + /* Make sure unroll factor is power of 2. */ + return 1 << ceil_log2 (max_unroll_factor); +} + /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary and return the new cost. */ unsigned int @@ -16299,8 +16937,11 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs) if (loop_vinfo && m_vec_flags && aarch64_use_new_vector_costs_p ()) - m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs, - m_costs[vect_body]); + { + m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs, + m_costs[vect_body]); + m_suggested_unroll_factor = determine_suggested_unroll_factor (); + } /* Apply the heuristic described above m_stp_sequence_cost. Prefer the scalar code in the event of a tie, since there is more chance @@ -16766,6 +17407,26 @@ initialize_aarch64_tls_size (struct gcc_options *opts) return; } +/* Return the CPU corresponding to the enum CPU. */ + +static const struct processor * +aarch64_get_tune_cpu (enum aarch64_processor cpu) +{ + gcc_assert (cpu != aarch64_none); + + return &all_cores[cpu]; +} + +/* Return the architecture corresponding to the enum ARCH. 
*/ + +static const struct processor * +aarch64_get_arch (enum aarch64_arch arch) +{ + gcc_assert (arch != aarch64_no_arch); + + return &all_architectures[arch]; +} + /* Parse STRING looking for options in the format: string :: option:string option :: name=substring @@ -16876,18 +17537,18 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts) void aarch64_override_options_internal (struct gcc_options *opts) { - aarch64_tune_flags = selected_tune->flags; - aarch64_tune = selected_tune->sched_core; + const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune); + aarch64_tune_flags = tune->flags; + aarch64_tune = tune->sched_core; /* Make a copy of the tuning parameters attached to the core, which we may later overwrite. */ - aarch64_tune_params = *(selected_tune->tune); - aarch64_architecture_version = selected_arch->architecture_version; - if (selected_tune->tune == &generic_tunings) + aarch64_tune_params = *(tune->tune); + if (tune->tune == &generic_tunings) aarch64_adjust_generic_arch_tuning (aarch64_tune_params); if (opts->x_aarch64_override_tune_string) aarch64_parse_override_string (opts->x_aarch64_override_tune_string, - &aarch64_tune_params); + &aarch64_tune_params); /* This target defaults to strict volatile bitfields. */ if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2)) @@ -17048,13 +17709,6 @@ aarch64_override_options_internal (struct gcc_options *opts) && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level) opts->x_flag_prefetch_loop_arrays = 1; - if (opts->x_aarch64_arch_string == NULL) - opts->x_aarch64_arch_string = selected_arch->name; - if (opts->x_aarch64_cpu_string == NULL) - opts->x_aarch64_cpu_string = selected_cpu->name; - if (opts->x_aarch64_tune_string == NULL) - opts->x_aarch64_tune_string = selected_tune->name; - aarch64_override_options_after_change_1 (opts); } @@ -17406,37 +18060,6 @@ aarch64_validate_mtune (const char *str, const struct processor **res) return false; } -/* Return the CPU corresponding to the enum CPU. - If it doesn't specify a cpu, return the default. */ - -static const struct processor * -aarch64_get_tune_cpu (enum aarch64_processor cpu) -{ - if (cpu != aarch64_none) - return &all_cores[cpu]; - - /* The & 0x3f is to extract the bottom 6 bits that encode the - default cpu as selected by the --with-cpu GCC configure option - in config.gcc. - ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS - flags mechanism should be reworked to make it more sane. */ - return &all_cores[TARGET_CPU_DEFAULT & 0x3f]; -} - -/* Return the architecture corresponding to the enum ARCH. - If it doesn't specify a valid architecture, return the default. */ - -static const struct processor * -aarch64_get_arch (enum aarch64_arch arch) -{ - if (arch != aarch64_no_arch) - return &all_architectures[arch]; - - const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f]; - - return &all_architectures[cpu->arch]; -} - /* Return the VG value associated with -msve-vector-bits= value VALUE. 
*/ static poly_uint16 @@ -17472,13 +18095,9 @@ aarch64_override_options (void) uint64_t arch_isa = 0; aarch64_isa_flags = 0; - bool valid_cpu = true; - bool valid_tune = true; - bool valid_arch = true; - - selected_cpu = NULL; - selected_arch = NULL; - selected_tune = NULL; + const struct processor *cpu = NULL; + const struct processor *arch = NULL; + const struct processor *tune = NULL; if (aarch64_harden_sls_string) aarch64_validate_sls_mitigation (aarch64_harden_sls_string); @@ -17490,77 +18109,52 @@ aarch64_override_options (void) If either of -march or -mtune is given, they override their respective component of -mcpu. */ if (aarch64_cpu_string) - valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu, - &cpu_isa); + aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa); if (aarch64_arch_string) - valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch, - &arch_isa); + aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa); if (aarch64_tune_string) - valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune); + aarch64_validate_mtune (aarch64_tune_string, &tune); #ifdef SUBTARGET_OVERRIDE_OPTIONS SUBTARGET_OVERRIDE_OPTIONS; #endif - /* If the user did not specify a processor, choose the default - one for them. This will be the CPU set during configuration using - --with-cpu, otherwise it is "generic". */ - if (!selected_cpu) + if (cpu && arch) { - if (selected_arch) - { - selected_cpu = &all_cores[selected_arch->ident]; - aarch64_isa_flags = arch_isa; - explicit_arch = selected_arch->arch; - } - else - { - /* Get default configure-time CPU. */ - selected_cpu = aarch64_get_tune_cpu (aarch64_none); - aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6; - } - - if (selected_tune) - explicit_tune_core = selected_tune->ident; - } - /* If both -mcpu and -march are specified check that they are architecturally - compatible, warn if they're not and prefer the -march ISA flags. */ - else if (selected_arch) - { - if (selected_arch->arch != selected_cpu->arch) + /* If both -mcpu and -march are specified, warn if they are not + architecturally compatible and prefer the -march ISA flags. */ + if (arch->arch != cpu->arch) { warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch", aarch64_cpu_string, aarch64_arch_string); } + + selected_arch = arch->arch; aarch64_isa_flags = arch_isa; - explicit_arch = selected_arch->arch; - explicit_tune_core = selected_tune ? selected_tune->ident - : selected_cpu->ident; } - else + else if (cpu) { - /* -mcpu but no -march. */ + selected_arch = cpu->arch; aarch64_isa_flags = cpu_isa; - explicit_tune_core = selected_tune ? selected_tune->ident - : selected_cpu->ident; - gcc_assert (selected_cpu); - selected_arch = &all_architectures[selected_cpu->arch]; - explicit_arch = selected_arch->arch; } - - /* Set the arch as well as we will need it when outputing - the .arch directive in assembly. */ - if (!selected_arch) + else if (arch) + { + cpu = &all_cores[arch->ident]; + selected_arch = arch->arch; + aarch64_isa_flags = arch_isa; + } + else { - gcc_assert (selected_cpu); - selected_arch = &all_architectures[selected_cpu->arch]; + /* No -mcpu or -march specified, so use the default CPU. */ + cpu = &all_cores[TARGET_CPU_DEFAULT]; + selected_arch = cpu->arch; + aarch64_isa_flags = cpu->flags; } - if (!selected_tune) - selected_tune = selected_cpu; + selected_tune = tune ? 
tune->ident : cpu->ident; if (aarch64_enable_bti == 2) { @@ -17596,15 +18190,6 @@ aarch64_override_options (void) if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32) sorry ("return address signing is only supported for %<-mabi=lp64%>"); - /* Make sure we properly set up the explicit options. */ - if ((aarch64_cpu_string && valid_cpu) - || (aarch64_tune_string && valid_tune)) - gcc_assert (explicit_tune_core != aarch64_none); - - if ((aarch64_cpu_string && valid_cpu) - || (aarch64_arch_string && valid_arch)) - gcc_assert (explicit_arch != aarch64_no_arch); - /* The pass to insert speculation tracking runs before shrink-wrapping and the latter does not know how to update the tracking status. So disable it in this case. */ @@ -17688,42 +18273,14 @@ initialize_aarch64_code_model (struct gcc_options *opts) } } -/* Implement TARGET_OPTION_SAVE. */ - -static void -aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts, - struct gcc_options */* opts_set */) -{ - ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string; - ptr->x_aarch64_branch_protection_string - = opts->x_aarch64_branch_protection_string; -} - /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions using the information saved in PTR. */ static void aarch64_option_restore (struct gcc_options *opts, - struct gcc_options */* opts_set */, - struct cl_target_option *ptr) -{ - opts->x_explicit_arch = ptr->x_explicit_arch; - selected_arch = aarch64_get_arch (ptr->x_explicit_arch); - opts->x_explicit_tune_core = ptr->x_explicit_tune_core; - if (opts->x_explicit_tune_core == aarch64_none - && opts->x_explicit_arch != aarch64_no_arch) - selected_tune = &all_cores[selected_arch->ident]; - else - selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); - opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string; - opts->x_aarch64_branch_protection_string - = ptr->x_aarch64_branch_protection_string; - if (opts->x_aarch64_branch_protection_string) - { - aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string, - NULL); - } - + struct gcc_options * /* opts_set */, + struct cl_target_option * /* ptr */) +{ aarch64_override_options_internal (opts); } @@ -17733,11 +18290,11 @@ static void aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr) { const struct processor *cpu - = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); - uint64_t isa_flags = ptr->x_aarch64_isa_flags; - const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch); + = aarch64_get_tune_cpu (ptr->x_selected_tune); + const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch); std::string extension - = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags); + = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_isa_flags, + arch->flags); fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name); fprintf (file, "%*sselected arch = %s%s\n", indent, "", @@ -17850,8 +18407,7 @@ aarch64_handle_attr_arch (const char *str) if (parse_res == AARCH64_PARSE_OK) { gcc_assert (tmp_arch); - selected_arch = tmp_arch; - explicit_arch = selected_arch->arch; + selected_arch = tmp_arch->arch; return true; } @@ -17861,11 +18417,11 @@ aarch64_handle_attr_arch (const char *str) error ("missing name in %<target(\"arch=\")%> pragma or attribute"); break; case AARCH64_PARSE_INVALID_ARG: - error ("invalid name (%qs) in %<target(\"arch=\")%> pragma or attribute", str); + error ("invalid name %qs in %<target(\"arch=\")%> 
pragma or attribute", str); aarch64_print_hint_for_arch (str); break; case AARCH64_PARSE_INVALID_FEATURE: - error ("invalid feature modifier %s of value (%qs) in " + error ("invalid feature modifier %s of value %qs in " "%<target()%> pragma or attribute", invalid_extension.c_str (), str); aarch64_print_hint_for_extensions (invalid_extension); break; @@ -17889,11 +18445,8 @@ aarch64_handle_attr_cpu (const char *str) if (parse_res == AARCH64_PARSE_OK) { gcc_assert (tmp_cpu); - selected_tune = tmp_cpu; - explicit_tune_core = selected_tune->ident; - - selected_arch = &all_architectures[tmp_cpu->arch]; - explicit_arch = selected_arch->arch; + selected_tune = tmp_cpu->ident; + selected_arch = tmp_cpu->arch; return true; } @@ -17903,11 +18456,11 @@ aarch64_handle_attr_cpu (const char *str) error ("missing name in %<target(\"cpu=\")%> pragma or attribute"); break; case AARCH64_PARSE_INVALID_ARG: - error ("invalid name (%qs) in %<target(\"cpu=\")%> pragma or attribute", str); + error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str); aarch64_print_hint_for_core (str); break; case AARCH64_PARSE_INVALID_FEATURE: - error ("invalid feature modifier %s of value (%qs) in " + error ("invalid feature modifier %qs of value %qs in " "%<target()%> pragma or attribute", invalid_extension.c_str (), str); aarch64_print_hint_for_extensions (invalid_extension); break; @@ -17934,7 +18487,7 @@ aarch64_handle_attr_cpu (const char *str) " attribute"); break; case AARCH64_PARSE_INVALID_ARG: - error ("invalid protection type (%qs) in %<target(\"branch-protection" + error ("invalid protection type %qs in %<target(\"branch-protection" "=\")%> pragma or attribute", err_str); break; case AARCH64_PARSE_OK: @@ -17961,15 +18514,14 @@ aarch64_handle_attr_tune (const char *str) if (parse_res == AARCH64_PARSE_OK) { gcc_assert (tmp_tune); - selected_tune = tmp_tune; - explicit_tune_core = selected_tune->ident; + selected_tune = tmp_tune->ident; return true; } switch (parse_res) { case AARCH64_PARSE_INVALID_ARG: - error ("invalid name (%qs) in %<target(\"tune=\")%> pragma or attribute", str); + error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str); aarch64_print_hint_for_core (str); break; default: @@ -18014,7 +18566,7 @@ aarch64_handle_attr_isa_flags (char *str) break; case AARCH64_PARSE_INVALID_FEATURE: - error ("invalid feature modifier %s of value (%qs) in " + error ("invalid feature modifier %qs of value %qs in " "%<target()%> pragma or attribute", invalid_extension.c_str (), str); break; @@ -18697,7 +19249,7 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x) { /* Support CSE and rematerialization of common constants. 
*/ if (CONST_INT_P (x) - || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)) + || CONST_DOUBLE_P (x)) return true; /* Only accept variable-length vector constants if they can be @@ -19138,6 +19690,18 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, field_t = long_double_type_node; field_ptr_t = long_double_ptr_type_node; break; + case E_SDmode: + field_t = dfloat32_type_node; + field_ptr_t = build_pointer_type (dfloat32_type_node); + break; + case E_DDmode: + field_t = dfloat64_type_node; + field_ptr_t = build_pointer_type (dfloat64_type_node); + break; + case E_TDmode: + field_t = dfloat128_type_node; + field_ptr_t = build_pointer_type (dfloat128_type_node); + break; case E_HFmode: field_t = aarch64_fp16_type_node; field_ptr_t = aarch64_fp16_ptr_type_node; @@ -19355,6 +19919,7 @@ aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode) a HFA or HVA. */ const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0; const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1; +const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2; /* Walk down the type tree of TYPE counting consecutive base elements. If *MODEP is VOIDmode, then set it to the first valid floating point @@ -19388,7 +19953,8 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep, case REAL_TYPE: mode = TYPE_MODE (type); if (mode != DFmode && mode != SFmode - && mode != TFmode && mode != HFmode) + && mode != TFmode && mode != HFmode + && mode != SDmode && mode != DDmode && mode != TDmode) return -1; if (*modep == VOIDmode) @@ -19511,6 +20077,28 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep, continue; } } + /* A zero-width bitfield may affect layout in some + circumstances, but adds no members. The determination + of whether or not a type is an HFA is performed after + layout is complete, so if the type still looks like an + HFA afterwards, it is still classed as one. This is + potentially an ABI break for the hard-float ABI. */ + else if (DECL_BIT_FIELD (field) + && integer_zerop (DECL_SIZE (field))) + { + /* Prior to GCC-12 these fields were striped early, + hiding them from the back-end entirely and + resulting in the correct behaviour for argument + passing. Simulate that old behaviour without + generating a warning. 
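+		 For other zero-width bitfields (for instance a hypothetical
+		 C struct { float a; int : 0; float b; }) the member is
+		 likewise skipped for the HFA check, but
+		 WARN_PSABI_ZERO_WIDTH_BITFIELD is recorded so the caller can
+		 emit the GCC 12.1 -Wpsabi note.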
*/ + if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field)) + continue; + if (warn_psabi_flags) + { + *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD; + continue; + } + } sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep, warn_psabi_flags); @@ -19682,7 +20270,9 @@ aarch64_vfp_is_call_or_return_candidate (machine_mode mode, machine_mode new_mode = VOIDmode; bool composite_p = aarch64_composite_type_p (type, mode); - if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT) + if ((!composite_p + && (GET_MODE_CLASS (mode) == MODE_FLOAT + || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT)) || aarch64_short_vector_p (type, mode)) { *count = 1; @@ -19711,8 +20301,10 @@ aarch64_vfp_is_call_or_return_candidate (machine_mode mode, && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL)) != ag_count)) { - const char *url + const char *url10 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base"; + const char *url12 + = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields"; gcc_assert (alt == -1); last_reported_type_uid = uid; /* Use TYPE_MAIN_VARIANT to strip any redundant const @@ -19721,12 +20313,16 @@ aarch64_vfp_is_call_or_return_candidate (machine_mode mode, inform (input_location, "parameter passing for argument of " "type %qT with %<[[no_unique_address]]%> members " "changed %{in GCC 10.1%}", - TYPE_MAIN_VARIANT (type), url); + TYPE_MAIN_VARIANT (type), url10); else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE) inform (input_location, "parameter passing for argument of " "type %qT when C++17 is enabled changed to match " "C++14 %{in GCC 10.1%}", - TYPE_MAIN_VARIANT (type), url); + TYPE_MAIN_VARIANT (type), url10); + else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD) + inform (input_location, "parameter passing for argument of " + "type %qT changed %{in GCC 12.1%}", + TYPE_MAIN_VARIANT (type), url12); } if (is_ha != NULL) *is_ha = true; @@ -20071,7 +20667,7 @@ is_madd_op (enum attr_type t1) TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD }; - for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++) + for (i = 0; i < ARRAY_SIZE (mlatypes); i++) { if (t1 == mlatypes[i]) return true; @@ -21838,7 +22434,7 @@ aarch64_declare_function_name (FILE *stream, const char* name, gcc_assert (targ_options); const struct processor *this_arch - = aarch64_get_arch (targ_options->x_explicit_arch); + = aarch64_get_arch (targ_options->x_selected_arch); uint64_t isa_flags = targ_options->x_aarch64_isa_flags; std::string extension @@ -21857,7 +22453,7 @@ aarch64_declare_function_name (FILE *stream, const char* name, useful to readers of the generated asm. Do it only when it changes from function to function and verbose assembly is requested. 
*/ const struct processor *this_tune - = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core); + = aarch64_get_tune_cpu (targ_options->x_selected_tune); if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name) { @@ -21943,7 +22539,7 @@ aarch64_start_file (void) = TREE_TARGET_OPTION (target_option_default_node); const struct processor *default_arch - = aarch64_get_arch (default_options->x_explicit_arch); + = aarch64_get_arch (default_options->x_selected_arch); uint64_t default_isa_flags = default_options->x_aarch64_isa_flags; std::string extension = aarch64_get_extension_string_for_isa_flags (default_isa_flags, @@ -21994,14 +22590,14 @@ aarch64_emit_unlikely_jump (rtx insn) add_reg_br_prob_note (jump, profile_probability::very_unlikely ()); } -/* We store the names of the various atomic helpers in a 5x4 array. +/* We store the names of the various atomic helpers in a 5x5 array. Return the libcall function given MODE, MODEL and NAMES. */ rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, const atomic_ool_names *names) { - memmodel model = memmodel_base (INTVAL (model_rtx)); + memmodel model = memmodel_from_int (INTVAL (model_rtx)); int mode_idx, model_idx; switch (mode) @@ -22041,6 +22637,11 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, case MEMMODEL_SEQ_CST: model_idx = 3; break; + case MEMMODEL_SYNC_ACQUIRE: + case MEMMODEL_SYNC_RELEASE: + case MEMMODEL_SYNC_SEQ_CST: + model_idx = 4; + break; default: gcc_unreachable (); } @@ -22053,7 +22654,8 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, { "__aarch64_" #B #N "_relax", \ "__aarch64_" #B #N "_acq", \ "__aarch64_" #B #N "_rel", \ - "__aarch64_" #B #N "_acq_rel" } + "__aarch64_" #B #N "_acq_rel", \ + "__aarch64_" #B #N "_sync" } #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \ { NULL, NULL, NULL, NULL } @@ -22578,7 +23180,7 @@ aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode) } machine_mode vmode; - /* use a 64 bit mode for everything except for DI/DF mode, where we use + /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use a 128 bit vector mode. */ int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64; @@ -22736,7 +23338,9 @@ struct expand_vec_perm_d rtx target, op0, op1; vec_perm_indices perm; machine_mode vmode; + machine_mode op_mode; unsigned int vec_flags; + unsigned int op_vec_flags; bool one_vector_p; bool testing_p; }; @@ -22971,6 +23575,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d) newd.vmode = new_mode; newd.vec_flags = VEC_ADVSIMD; + newd.op_mode = newd.vmode; + newd.op_vec_flags = newd.vec_flags; newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL; newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL; newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL; @@ -23285,6 +23891,33 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) return true; } +/* Try to implement D using SVE dup instruction. 
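+   This covers permutes whose single input is a 128-bit Advanced SIMD
+   vector and whose SVE result repeats that input, i.e. the selector is
+   the single pattern 0, 1, ..., N-1 repeated across the SVE vector.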
*/ + +static bool +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d) +{ + if (BYTES_BIG_ENDIAN + || !d->one_vector_p + || d->vec_flags != VEC_SVE_DATA + || d->op_vec_flags != VEC_ADVSIMD + || d->perm.encoding ().nelts_per_pattern () != 1 + || !known_eq (d->perm.encoding ().npatterns (), + GET_MODE_NUNITS (d->op_mode)) + || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128)) + return false; + + int npatterns = d->perm.encoding ().npatterns (); + for (int i = 0; i < npatterns; i++) + if (!known_eq (d->perm[i], i)) + return false; + + if (d->testing_p) + return true; + + aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0); + return true; +} + /* Try to implement D using SVE SEL instruction. */ static bool @@ -23408,6 +24041,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d) static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) { + gcc_assert (d->op_mode != E_VOIDmode); + /* The pattern matching functions above are written to look for a small number to begin the sequence (0, 1, N/2). If we begin with an index from the second operand, we can swap the operands. */ @@ -23424,30 +24059,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) || d->vec_flags == VEC_SVE_PRED) && known_gt (nelt, 1)) { - if (aarch64_evpc_rev_local (d)) - return true; - else if (aarch64_evpc_rev_global (d)) - return true; - else if (aarch64_evpc_ext (d)) - return true; - else if (aarch64_evpc_dup (d)) - return true; - else if (aarch64_evpc_zip (d)) - return true; - else if (aarch64_evpc_uzp (d)) - return true; - else if (aarch64_evpc_trn (d)) - return true; - else if (aarch64_evpc_sel (d)) - return true; - else if (aarch64_evpc_ins (d)) - return true; - else if (aarch64_evpc_reencode (d)) - return true; - if (d->vec_flags == VEC_SVE_DATA) - return aarch64_evpc_sve_tbl (d); - else if (d->vec_flags == VEC_ADVSIMD) - return aarch64_evpc_tbl (d); + if (d->vmode == d->op_mode) + { + if (aarch64_evpc_rev_local (d)) + return true; + else if (aarch64_evpc_rev_global (d)) + return true; + else if (aarch64_evpc_ext (d)) + return true; + else if (aarch64_evpc_dup (d)) + return true; + else if (aarch64_evpc_zip (d)) + return true; + else if (aarch64_evpc_uzp (d)) + return true; + else if (aarch64_evpc_trn (d)) + return true; + else if (aarch64_evpc_sel (d)) + return true; + else if (aarch64_evpc_ins (d)) + return true; + else if (aarch64_evpc_reencode (d)) + return true; + + if (d->vec_flags == VEC_SVE_DATA) + return aarch64_evpc_sve_tbl (d); + else if (d->vec_flags == VEC_ADVSIMD) + return aarch64_evpc_tbl (d); + } + else + { + if (aarch64_evpc_sve_dup (d)) + return true; + } } return false; } @@ -23455,8 +24099,9 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ static bool -aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) +aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) { struct expand_vec_perm_d d; @@ -23481,6 +24126,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, sel.nelts_per_input ()); d.vmode = vmode; d.vec_flags = aarch64_classify_vector_mode (d.vmode); + d.op_mode = op_mode; + d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode); d.target = target; d.op0 = op0 ? 
force_reg (vmode, op0) : NULL_RTX; if (op0 == op1) @@ -23855,17 +24502,15 @@ aarch64_expand_cpymem_mops (rtx *operands) { if (!TARGET_MOPS) return false; - rtx addr_dst = XEXP (operands[0], 0); - rtx addr_src = XEXP (operands[1], 0); - rtx sz_reg = operands[2]; - - if (!REG_P (sz_reg)) - sz_reg = force_reg (DImode, sz_reg); - if (!REG_P (addr_dst)) - addr_dst = force_reg (DImode, addr_dst); - if (!REG_P (addr_src)) - addr_src = force_reg (DImode, addr_src); - emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg)); + + /* All three registers are changed by the instruction, so each one + must be a fresh pseudo. */ + rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); + rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0)); + rtx dst_mem = replace_equiv_address (operands[0], dst_addr); + rtx src_mem = replace_equiv_address (operands[1], src_addr); + rtx sz_reg = copy_to_mode_reg (DImode, operands[2]); + emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg)); return true; } @@ -24042,17 +24687,15 @@ aarch64_expand_setmem_mops (rtx *operands) if (!TARGET_MOPS) return false; - rtx addr_dst = XEXP (operands[0], 0); - rtx sz_reg = operands[1]; + /* The first two registers are changed by the instruction, so both + of them must be a fresh pseudo. */ + rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); + rtx dst_mem = replace_equiv_address (operands[0], dst_addr); + rtx sz_reg = copy_to_mode_reg (DImode, operands[1]); rtx val = operands[2]; - - if (!REG_P (sz_reg)) - sz_reg = force_reg (DImode, sz_reg); - if (!REG_P (addr_dst)) - addr_dst = force_reg (DImode, addr_dst); - if (!REG_P (val) && val != CONST0_RTX (QImode)) - val = force_reg (QImode, val); - emit_insn (gen_aarch64_setmemdi (addr_dst, val, sz_reg)); + if (val != CONST0_RTX (QImode)) + val = force_reg (QImode, val); + emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg)); return true; } @@ -25400,7 +26043,7 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, base_off = (off_val_1 + off_val_3) / 2; else /* However, due to issues with negative LDP/STP offset generation for - larger modes, for DF, DI and vector modes. we must not use negative + larger modes, for DF, DD, DI and vector modes. we must not use negative addresses smaller than 9 signed unadjusted bits can store. This provides the most range in this case. */ base_off = off_val_1; @@ -25678,6 +26321,9 @@ aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode) static bool aarch64_scalar_mode_supported_p (scalar_mode mode) { + if (DECIMAL_FLOAT_MODE_P (mode)) + return default_decimal_float_supported_p (); + return (mode == HFmode ? true : default_scalar_mode_supported_p (mode)); @@ -26814,9 +27460,6 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_OFFLOAD_OPTIONS #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options -#undef TARGET_OPTION_SAVE -#define TARGET_OPTION_SAVE aarch64_option_save - #undef TARGET_OPTION_RESTORE #define TARGET_OPTION_RESTORE aarch64_option_restore diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 27ba4f4..80cfe4b 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -148,9 +148,6 @@ #define PCC_BITFIELD_TYPE_MATTERS 1 -/* Major revision number of the ARM Architecture implemented by the target. */ -extern unsigned aarch64_architecture_version; - /* Instruction tuning/selection flags. */ /* Bit values used to identify processor capabilities. 
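   Each AARCH64_FL_* value is a single bit in aarch64_isa_flags; the
   AARCH64_FL_FOR_ARCH* sets below are formed by ORing an architecture's
   base set with its mandatory features.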
*/ @@ -278,7 +275,8 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_FOR_ARCH8_R \ (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_R) #define AARCH64_FL_FOR_ARCH9 \ - (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_V9) + (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_V9 \ + | AARCH64_FL_F16) /* Macros to test ISA flags. */ @@ -814,8 +812,7 @@ enum target_cpus /* If there is no CPU defined at configure, use generic as default. */ #ifndef TARGET_CPU_DEFAULT -#define TARGET_CPU_DEFAULT \ - (TARGET_CPU_generic | (AARCH64_CPU_DEFAULT_FLAGS << 6)) +# define TARGET_CPU_DEFAULT TARGET_CPU_generic #endif /* If inserting NOP before a mult-accumulate insn remember to adjust the diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index c985250..acec8c1 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1477,11 +1477,11 @@ (set_attr "arch" "simd,fp16,simd,simd,simd,fp16,simd,*,*,*,*,*")] ) -(define_insn "*movsf_aarch64" - [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w ,?r,w,w ,w ,w,m,r,m ,r,r") - (match_operand:SF 1 "general_operand" "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))] - "TARGET_FLOAT && (register_operand (operands[0], SFmode) - || aarch64_reg_or_fp_zero (operands[1], SFmode))" +(define_insn "*mov<mode>_aarch64" + [(set (match_operand:SFD 0 "nonimmediate_operand" "=w,w ,?r,w,w ,w ,w,m,r,m ,r,r") + (match_operand:SFD 1 "general_operand" "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))] + "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) + || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" "@ movi\\t%0.2s, #0 fmov\\t%s0, %w1 @@ -1501,11 +1501,11 @@ (set_attr "arch" "simd,*,*,*,*,simd,*,*,*,*,*,*")] ) -(define_insn "*movdf_aarch64" - [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w ,?r,w,w ,w ,w,m,r,m ,r,r") - (match_operand:DF 1 "general_operand" "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))] - "TARGET_FLOAT && (register_operand (operands[0], DFmode) - || aarch64_reg_or_fp_zero (operands[1], DFmode))" +(define_insn "*mov<mode>_aarch64" + [(set (match_operand:DFD 0 "nonimmediate_operand" "=w, w ,?r,w,w ,w ,w,m,r,m ,r,r") + (match_operand:DFD 1 "general_operand" "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))] + "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) + || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" "@ movi\\t%d0, #0 fmov\\t%d0, %x1 @@ -1545,13 +1545,13 @@ } ) -(define_insn "*movtf_aarch64" - [(set (match_operand:TF 0 +(define_insn "*mov<mode>_aarch64" + [(set (match_operand:TFD 0 "nonimmediate_operand" "=w,?r ,w ,?r,w,?w,w,m,?r,m ,m") - (match_operand:TF 1 + (match_operand:TFD 1 "general_operand" " w,?rY,?r,w ,Y,Y ,m,w,m ,?r,Y"))] - "TARGET_FLOAT && (register_operand (operands[0], TFmode) - || aarch64_reg_or_fp_zero (operands[1], TFmode))" + "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) + || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" "@ mov\\t%0.16b, %1.16b # @@ -1571,8 +1571,8 @@ ) (define_split - [(set (match_operand:TF 0 "register_operand" "") - (match_operand:TF 1 "nonmemory_operand" ""))] + [(set (match_operand:TFD 0 "register_operand" "") + (match_operand:TFD 1 "nonmemory_operand" ""))] "reload_completed && aarch64_split_128bit_move_p (operands[0], operands[1])" [(const_int 0)] { @@ -1581,16 +1581,29 @@ } ) -(define_insn "aarch64_cpymemdi" - [(parallel [ - (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0)) +(define_expand "aarch64_cpymemdi" + [(parallel + [(set (match_operand 2) (const_int 0)) + (clobber (match_dup 3)) + 
(clobber (match_dup 4)) + (set (match_operand 0) + (unspec:BLK [(match_operand 1) (match_dup 2)] UNSPEC_CPYMEM))])] + "TARGET_MOPS" + { + operands[3] = XEXP (operands[0], 0); + operands[4] = XEXP (operands[1], 0); + } +) + +(define_insn "*aarch64_cpymemdi" + [(set (match_operand:DI 2 "register_operand" "+&r") (const_int 0)) (clobber (match_operand:DI 0 "register_operand" "+&r")) (clobber (match_operand:DI 1 "register_operand" "+&r")) (set (mem:BLK (match_dup 0)) - (unspec:BLK [(mem:BLK (match_dup 1)) (match_dup 2)] UNSPEC_CPYMEM))])] - "TARGET_MOPS" - "cpyfp\t[%x0]!, [%x1]!, %x2!\;cpyfm\t[%x0]!, [%x1]!, %x2!\;cpyfe\t[%x0]!, [%x1]!, %x2!" - [(set_attr "length" "12")] + (unspec:BLK [(mem:BLK (match_dup 1)) (match_dup 2)] UNSPEC_CPYMEM))] + "TARGET_MOPS" + "cpyfp\t[%x0]!, [%x1]!, %x2!\;cpyfm\t[%x0]!, [%x1]!, %x2!\;cpyfe\t[%x0]!, [%x1]!, %x2!" + [(set_attr "length" "12")] ) ;; 0 is dst @@ -1657,16 +1670,28 @@ } ) -(define_insn "aarch64_setmemdi" - [(parallel [ - (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0)) +(define_expand "aarch64_setmemdi" + [(parallel + [(set (match_operand 2) (const_int 0)) + (clobber (match_dup 3)) + (set (match_operand 0) + (unspec:BLK [(match_operand 1) + (match_dup 2)] UNSPEC_SETMEM))])] + "TARGET_MOPS" + { + operands[3] = XEXP (operands[0], 0); + } +) + +(define_insn "*aarch64_setmemdi" + [(set (match_operand:DI 2 "register_operand" "+&r") (const_int 0)) (clobber (match_operand:DI 0 "register_operand" "+&r")) (set (mem:BLK (match_dup 0)) (unspec:BLK [(match_operand:QI 1 "aarch64_reg_or_zero" "rZ") - (match_dup 2)] UNSPEC_SETMEM))])] - "TARGET_MOPS" - "setp\t[%x0]!, %x2!, %x1\;setm\t[%x0]!, %x2!, %x1\;sete\t[%x0]!, %x2!, %x1" - [(set_attr "length" "12")] + (match_dup 2)] UNSPEC_SETMEM))] + "TARGET_MOPS" + "setp\t[%x0]!, %x2!, %x1\;setm\t[%x0]!, %x2!, %x1\;sete\t[%x0]!, %x2!, %x1" + [(set_attr "length" "12")] ) ;; 0 is dst @@ -4524,7 +4549,11 @@ (set (match_operand:GPI 0 "register_operand" "=r") (and:GPI (SHIFT:GPI (match_dup 1) (match_dup 2)) (match_dup 3)))] "" - "ands\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (<sizen> - UINTVAL (operands[2])); + return "ands\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2"; +} [(set_attr "type" "logics_shift_imm")] ) @@ -4541,7 +4570,11 @@ (zero_extend:DI (and:SI (SHIFT:SI (match_dup 1) (match_dup 2)) (match_dup 3))))] "" - "ands\\t%w0, %w3, %w1, <SHIFT:shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (32 - UINTVAL (operands[2])); + return "ands\\t%w0, %w3, %w1, <SHIFT:shift> %2"; +} [(set_attr "type" "logics_shift_imm")] ) @@ -4552,7 +4585,11 @@ (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n")) (match_operand:GPI 3 "register_operand" "r")))] "" - "<LOGICAL:logical>\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (<sizen> - UINTVAL (operands[2])); + return "<LOGICAL:logical>\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2"; +} [(set_attr "type" "logic_shift_imm")] ) @@ -4622,17 +4659,6 @@ "operands[3] = gen_reg_rtx (<MODE>mode);" ) -(define_insn "*<optab>_rol<mode>3" - [(set (match_operand:GPI 0 "register_operand" "=r") - (LOGICAL:GPI (rotate:GPI - (match_operand:GPI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n")) - (match_operand:GPI 3 "register_operand" "r")))] - "" - "<logical>\\t%<w>0, %<w>3, %<w>1, ror #(<sizen> - %2)" - [(set_attr "type" "logic_shift_imm")] -) - ;; zero_extend versions of above (define_insn "*<LOGICAL:optab>_<SHIFT:optab>si3_uxtw" [(set (match_operand:DI 0 
"register_operand" "=r") @@ -4642,19 +4668,11 @@ (match_operand:QI 2 "aarch64_shift_imm_si" "n")) (match_operand:SI 3 "register_operand" "r"))))] "" - "<LOGICAL:logical>\\t%w0, %w3, %w1, <SHIFT:shift> %2" - [(set_attr "type" "logic_shift_imm")] -) - -(define_insn "*<optab>_rolsi3_uxtw" - [(set (match_operand:DI 0 "register_operand" "=r") - (zero_extend:DI - (LOGICAL:SI (rotate:SI - (match_operand:SI 1 "register_operand" "r") - (match_operand:QI 2 "aarch64_shift_imm_si" "n")) - (match_operand:SI 3 "register_operand" "r"))))] - "" - "<logical>\\t%w0, %w3, %w1, ror #(32 - %2)" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (32 - UINTVAL (operands[2])); + return "<LOGICAL:logical>\\t%w0, %w3, %w1, <SHIFT:shift> %2"; +} [(set_attr "type" "logic_shift_imm")] ) @@ -4683,7 +4701,11 @@ (not:GPI (SHIFT:GPI (match_operand:GPI 1 "register_operand" "r") (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))))] "" - "mvn\\t%<w>0, %<w>1, <shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (<sizen> - UINTVAL (operands[2])); + return "mvn\\t%<w>0, %<w>1, <shift> %2"; +} [(set_attr "type" "logic_shift_imm")] ) @@ -4790,7 +4812,28 @@ (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))) (match_operand:GPI 3 "register_operand" "r")))] "" - "<LOGICAL:nlogical>\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (<sizen> - UINTVAL (operands[2])); + return "<LOGICAL:nlogical>\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2"; +} + [(set_attr "type" "logic_shift_imm")] +) + +;; Zero-extend version of the above. +(define_insn "<LOGICAL:optab>_one_cmpl_<SHIFT:optab>sidi_uxtw" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI (LOGICAL:SI (not:SI + (SHIFT:SI + (match_operand:SI 1 "register_operand" "r") + (match_operand:QI 2 "aarch64_shift_imm_si" "n"))) + (match_operand:SI 3 "register_operand" "r"))))] + "" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (32 - UINTVAL (operands[2])); + return "<LOGICAL:nlogical>\\t%w0, %w3, %w1, <SHIFT:shift> %2"; +} [(set_attr "type" "logic_shift_imm")] ) @@ -4802,7 +4845,11 @@ (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n")) (match_operand:GPI 3 "register_operand" "r"))))] "" - "eon\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (<sizen> - UINTVAL (operands[2])); + return "eon\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2"; +} [(set_attr "type" "logic_shift_imm")] ) @@ -4816,7 +4863,11 @@ (match_operand:QI 2 "aarch64_shift_imm_si" "n")) (match_operand:SI 3 "register_operand" "r")))))] "" - "eon\\t%w0, %w3, %w1, <SHIFT:shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (32 - UINTVAL (operands[2])); + return "eon\\t%w0, %w3, %w1, <SHIFT:shift> %2"; +} [(set_attr "type" "logic_shift_imm")] ) @@ -4834,7 +4885,11 @@ (SHIFT:GPI (match_dup 1) (match_dup 2))) (match_dup 3)))] "" - "bics\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (<sizen> - UINTVAL (operands[2])); + return "bics\\t%<w>0, %<w>3, %<w>1, <SHIFT:shift> %2"; +} [(set_attr "type" "logics_shift_imm")] ) @@ -4853,7 +4908,11 @@ (not:SI (SHIFT:SI (match_dup 1) (match_dup 2))) (match_dup 3))))] "" - "bics\\t%w0, %w3, %w1, <SHIFT:shift> %2" +{ + if (<SHIFT:is_rotl>) + operands[2] = GEN_INT (32 - UINTVAL (operands[2])); + return "bics\\t%w0, %w3, %w1, <SHIFT:shift> %2"; +} [(set_attr "type" "logics_shift_imm")] ) @@ -4867,7 +4926,11 @@ (match_operand:GPI 2 "register_operand" "r")) (const_int 0)))] "" - "bics\\t<w>zr, %<w>2, %<w>0, <SHIFT:shift> %1" +{ + if (<SHIFT:is_rotl>) 
+ operands[1] = GEN_INT (<sizen> - UINTVAL (operands[1])); + return "bics\\t<w>zr, %<w>2, %<w>0, <SHIFT:shift> %1"; +} [(set_attr "type" "logics_shift_imm")] ) @@ -5041,7 +5104,11 @@ (match_operand:GPI 2 "register_operand" "r")) (const_int 0)))] "" - "tst\\t%<w>2, %<w>0, <SHIFT:shift> %1" +{ + if (<SHIFT:is_rotl>) + operands[1] = GEN_INT (<sizen> - UINTVAL (operands[1])); + return "tst\\t%<w>2, %<w>0, <SHIFT:shift> %1"; +} [(set_attr "type" "logics_shift_imm")] ) @@ -5442,10 +5509,22 @@ [(set_attr "type" "rotate_imm,shift_reg")] ) -;; zero_extend version of above +(define_insn "*rol<mode>3_insn" + [(set (match_operand:GPI 0 "register_operand" "=r") + (rotate:GPI (match_operand:GPI 1 "register_operand" "r") + (match_operand 2 "const_int_operand" "n")))] + "UINTVAL (operands[2]) < GET_MODE_BITSIZE (<MODE>mode)" +{ + operands[3] = GEN_INT (<sizen> - UINTVAL (operands[2])); + return "ror\\t%<w>0, %<w>1, %3"; +} + [(set_attr "type" "rotate_imm")] +) + +;; zero_extend version of shifts (define_insn "*<optab>si3_insn_uxtw" [(set (match_operand:DI 0 "register_operand" "=r,r") - (zero_extend:DI (SHIFT:SI + (zero_extend:DI (SHIFT_no_rotate:SI (match_operand:SI 1 "register_operand" "r,r") (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))] "" @@ -5455,6 +5534,31 @@ [(set_attr "type" "bfx,shift_reg")] ) +;; zero_extend version of rotate right +(define_insn "*rorsi3_insn_uxtw" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (rotatert:SI (match_operand:SI 1 "register_operand" "r") + (match_operand 2 "const_int_operand" "n"))))] + "UINTVAL (operands[2]) < 32" + "ror\\t%w0, %w1, %2" + [(set_attr "type" "rotate_imm")] +) + +;; zero_extend version of rotate left +(define_insn "*rolsi3_insn_uxtw" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (rotate:SI (match_operand:SI 1 "register_operand" "r") + (match_operand 2 "const_int_operand" "n"))))] + "UINTVAL (operands[2]) < 32" +{ + operands[2] = GEN_INT (32 - UINTVAL (operands[2])); + return "ror\\t%w0, %w1, %2"; +} + [(set_attr "type" "rotate_imm")] +) + (define_insn "*<optab><mode>3_insn" [(set (match_operand:SHORT 0 "register_operand" "=r") (ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r") @@ -5537,32 +5641,6 @@ [(set_attr "type" "rotate_imm")] ) -(define_insn "*ror<mode>3_insn" - [(set (match_operand:GPI 0 "register_operand" "=r") - (rotate:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand 2 "const_int_operand" "n")))] - "UINTVAL (operands[2]) < GET_MODE_BITSIZE (<MODE>mode)" -{ - operands[3] = GEN_INT (<sizen> - UINTVAL (operands[2])); - return "ror\\t%<w>0, %<w>1, %3"; -} - [(set_attr "type" "rotate_imm")] -) - -;; zero_extend version of the above -(define_insn "*rorsi3_insn_uxtw" - [(set (match_operand:DI 0 "register_operand" "=r") - (zero_extend:DI - (rotate:SI (match_operand:SI 1 "register_operand" "r") - (match_operand 2 "const_int_operand" "n"))))] - "UINTVAL (operands[2]) < 32" -{ - operands[3] = GEN_INT (32 - UINTVAL (operands[2])); - return "ror\\t%w0, %w1, %3"; -} - [(set_attr "type" "rotate_imm")] -) - (define_insn "*<ANY_EXTEND:optab><GPI:mode>_ashl<SHORT:mode>" [(set (match_operand:GPI 0 "register_operand" "=r") (ANY_EXTEND:GPI diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index 98ce9c0..d8e1f42 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -22,13 +22,10 @@ HeaderInclude config/aarch64/aarch64-opts.h TargetVariable -enum aarch64_processor explicit_tune_core = aarch64_none +enum 
aarch64_processor selected_tune = aarch64_none TargetVariable -enum aarch64_arch explicit_arch = aarch64_no_arch - -TargetSave -const char *x_aarch64_override_tune_string +enum aarch64_arch selected_arch = aarch64_no_arch TargetVariable uint64_t aarch64_isa_flags = 0 @@ -36,6 +33,9 @@ uint64_t aarch64_isa_flags = 0 TargetVariable unsigned aarch64_enable_bti = 2 +TargetVariable +enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A + ; The TLS dialect names to use with -mtls-dialect. Enum @@ -139,7 +139,7 @@ Target RejectNegative Joined Enum(aarch64_abi) Var(aarch64_abi) Init(AARCH64_ABI Generate code that conforms to the specified ABI. moverride= -Target RejectNegative ToLower Joined Var(aarch64_override_tune_string) +Target RejectNegative ToLower Joined Var(aarch64_override_tune_string) Save -moverride=<string> Power users only! Override CPU optimization parameters. Enum @@ -292,3 +292,7 @@ Constant memmove size in bytes above which to start using MOPS sequence. -param=aarch64-mops-memset-size-threshold= Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param Constant memset size in bytes from which to start using MOPS sequence. + +-param=aarch64-vect-unroll-limit= +Target Joined UInteger Var(aarch64_vect_unroll_limit) Init(4) Param +Limit how much the autovectorizer may unroll a loop. diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index ecd852f..9775a48 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -29,6 +29,8 @@ #include <stdint.h> +#pragma GCC aarch64 "arm_acle.h" + #ifdef __cplusplus extern "C" { #endif diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index e72fdf35..1be6a91 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -63,18 +63,25 @@ ;; Iterator for all 16-bit scalar floating point modes (HF, BF) (define_mode_iterator HFBF [HF BF]) -;; Iterator for all scalar floating point modes (HF, SF, DF and TF) -(define_mode_iterator GPF_TF_F16 [HF SF DF TF]) - ;; Iterator for all scalar floating point modes suitable for moving, including -;; special BF type (HF, SF, DF, TF and BF) -(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF]) +;; special BF type and decimal floating point types (HF, SF, DF, TF, BF, +;; SD, DD and TD) +(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF SD DD TD]) + +;; Iterator for scalar 32bit fp modes (SF, SD) +(define_mode_iterator SFD [SD SF]) + +;; Iterator for scalar 64bit fp modes (DF, DD) +(define_mode_iterator DFD [DD DF]) + +;; Iterator for scalar 128bit fp modes (TF, TD) +(define_mode_iterator TFD [TD TF]) ;; Double vector modes. (define_mode_iterator VDF [V2SF V4HF]) -;; Iterator for all scalar floating point modes (SF, DF and TF) -(define_mode_iterator GPF_TF [SF DF TF]) +;; Iterator for all scalar floating point modes (SF, DF, TF, SD, DD, and TD) +(define_mode_iterator GPF_TF [SF DF TF SD DD TD]) ;; Integer Advanced SIMD modes. (define_mode_iterator VDQ_I [V8QI V16QI V4HI V8HI V2SI V4SI V2DI]) @@ -301,7 +308,7 @@ ;; 2 and 4 lane SI modes. (define_mode_iterator VS [V2SI V4SI]) -(define_mode_iterator TX [TI TF]) +(define_mode_iterator TX [TI TF TD]) ;; Advanced SIMD opaque structure modes. 
(define_mode_iterator VSTRUCT [OI CI XI]) @@ -403,10 +410,10 @@ V4x8HF V4x4SF V4x2DF V4x8BF]) ;; Double scalar modes -(define_mode_iterator DX [DI DF]) +(define_mode_iterator DX [DI DF DD]) ;; Duplicate of the above -(define_mode_iterator DX2 [DI DF]) +(define_mode_iterator DX2 [DI DF DD]) ;; Single scalar modes (define_mode_iterator SX [SI SF]) @@ -2112,7 +2119,10 @@ ;; ------------------------------------------------------------------- ;; This code iterator allows the various shifts supported on the core -(define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert]) +(define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate]) + +;; This code iterator allows all shifts except for rotates. +(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt]) ;; This code iterator allows the shifts supported in arithmetic instructions (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt]) @@ -2242,6 +2252,7 @@ (ashiftrt "ashr") (lshiftrt "lshr") (rotatert "rotr") + (rotate "rotl") (sign_extend "extend") (zero_extend "zero_extend") (sign_extract "extv") @@ -2331,7 +2342,10 @@ ;; Similar for the instruction mnemonics (define_code_attr shift [(ashift "lsl") (ashiftrt "asr") - (lshiftrt "lsr") (rotatert "ror")]) + (lshiftrt "lsr") (rotatert "ror") (rotate "ror")]) +;; True if shift is rotate left. +(define_code_attr is_rotl [(ashift "0") (ashiftrt "0") + (lshiftrt "0") (rotatert "0") (rotate "1")]) ;; Op prefix for shift right and accumulate. (define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")]) diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index 75b463d..ba74abc 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -30,10 +30,22 @@ s-aarch64-tune-md: $(srcdir)/config/aarch64/gentune.sh \ $(SHELL) $(srcdir)/config/aarch64/gentune.sh \ $(srcdir)/config/aarch64/aarch64-cores.def > \ tmp-aarch64-tune.md +ifneq ($(strip $(ENABLE_MAINTAINER_RULES)),) $(SHELL) $(srcdir)/../move-if-change tmp-aarch64-tune.md \ $(srcdir)/config/aarch64/aarch64-tune.md +else + @if ! cmp -s tmp-aarch64-tune.md \ + $(srcdir)/config/aarch64/aarch64-tune.md; then \ + echo "aarch64-tune.md has changed; either"; \ + echo "configure with --enable-maintainer-mode"; \ + echo "or copy tmp-aarch64-tune.md to $(srcdir)/config/aarch64/aarch64-tune.md"; \ + exit 1; \ + fi +endif $(STAMP) s-aarch64-tune-md +s-mddeps: s-aarch64-tune-md + aarch64-builtins.o: $(srcdir)/config/aarch64/aarch64-builtins.cc $(CONFIG_H) \ $(SYSTEM_H) coretypes.h $(TM_H) \ $(RTL_H) $(TREE_H) expr.h $(TM_P_H) $(RECOG_H) langhooks.h \ diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index 36a40a1..d917137 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -1471,7 +1471,7 @@ arm_lookup_simd_builtin_type (machine_mode mode, enum arm_type_qualifiers q) { int i; - int nelts = sizeof (arm_simd_types) / sizeof (arm_simd_types[0]); + int nelts = ARRAY_SIZE (arm_simd_types); /* Non-poly scalar modes map to standard types not in the table. */ if (q != qualifier_poly && !VECTOR_MODE_P (mode)) @@ -1503,7 +1503,7 @@ static void arm_init_simd_builtin_types (void) { int i; - int nelts = sizeof (arm_simd_types) / sizeof (arm_simd_types[0]); + int nelts = ARRAY_SIZE (arm_simd_types); tree tdecl; /* Poly types are a world of their own. 
In order to maintain legacy diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index 0d3082b..5a63bc5 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -1638,6 +1638,16 @@ begin cpu cortex-m55 vendor 41 end cpu cortex-m55 +begin cpu star-mc1 + cname starmc1 + tune flags LDSCHED + architecture armv8-m.main+dsp+fp + option nofp remove ALL_FP + option nodsp remove armv7em + isa quirk_no_asmcpu quirk_vlldm + costs v7m +end cpu star-mc1 + # V8 R-profile implementations. begin cpu cortex-r52 cname cortexr52 diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt index ef0cc5e..e6461ab 100644 --- a/gcc/config/arm/arm-tables.opt +++ b/gcc/config/arm/arm-tables.opt @@ -283,6 +283,9 @@ EnumValue Enum(processor_type) String(cortex-m55) Value( TARGET_CPU_cortexm55) EnumValue +Enum(processor_type) String(star-mc1) Value( TARGET_CPU_starmc1) + +EnumValue Enum(processor_type) String(cortex-r52) Value( TARGET_CPU_cortexr52) EnumValue diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md index 3422553..abc290e 100644 --- a/gcc/config/arm/arm-tune.md +++ b/gcc/config/arm/arm-tune.md @@ -49,6 +49,6 @@ cortexa710,cortexx1,neoversen1, cortexa75cortexa55,cortexa76cortexa55,neoversev1, neoversen2,cortexm23,cortexm33, - cortexm35p,cortexm55,cortexr52, - cortexr52plus" + cortexm35p,cortexm55,starmc1, + cortexr52,cortexr52plus" (const (symbol_ref "((enum attr_tune) arm_tune)"))) diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index e062361..2925907 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -296,8 +296,8 @@ static int arm_cortex_a5_branch_cost (bool, bool); static int arm_cortex_m_branch_cost (bool, bool); static int arm_cortex_m7_branch_cost (bool, bool); -static bool arm_vectorize_vec_perm_const (machine_mode, rtx, rtx, rtx, - const vec_perm_indices &); +static bool arm_vectorize_vec_perm_const (machine_mode, machine_mode, rtx, rtx, + rtx, const vec_perm_indices &); static bool aarch_macro_fusion_pair_p (rtx_insn*, rtx_insn*); @@ -6194,7 +6194,7 @@ arm_pcs_from_attribute (tree attr) specification, DECL is the specific declartion. DECL may be null if the call could be indirect or if this is a library call. */ static enum arm_pcs -arm_get_pcs_model (const_tree type, const_tree decl) +arm_get_pcs_model (const_tree type, const_tree decl ATTRIBUTE_UNUSED) { bool user_convention = false; enum arm_pcs user_pcs = arm_pcs_default; @@ -6228,6 +6228,14 @@ arm_get_pcs_model (const_tree type, const_tree decl) return ARM_PCS_AAPCS; else if (user_convention) return user_pcs; +#if 0 + /* Unfortunately, this is not safe and can lead to wrong code + being generated (PR96882). Not all calls into the back-end + pass the DECL, so it is unsafe to make any PCS-changing + decisions based on it. In particular the RETURN_IN_MEMORY + hook is only ever passed a TYPE. This needs revisiting to + see if there are any partial improvements that can be + re-enabled. */ else if (decl && flag_unit_at_a_time) { /* Local functions never leak outside this compilation unit, @@ -6239,6 +6247,7 @@ arm_get_pcs_model (const_tree type, const_tree decl) if (local_info_node && local_info_node->local) return ARM_PCS_AAPCS_LOCAL; } +#endif } else if (user_convention && user_pcs != arm_pcs_default) sorry ("PCS variant"); @@ -6274,6 +6283,7 @@ aapcs_vfp_cum_init (CUMULATIVE_ARGS *pcum ATTRIBUTE_UNUSED, a HFA or HVA. 
*/ const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0; const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1; +const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2; /* Walk down the type tree of TYPE counting consecutive base elements. If *MODEP is VOIDmode, then set it to the first valid floating point @@ -6426,6 +6436,28 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep, continue; } } + /* A zero-width bitfield may affect layout in some + circumstances, but adds no members. The determination + of whether or not a type is an HFA is performed after + layout is complete, so if the type still looks like an + HFA afterwards, it is still classed as one. This is + potentially an ABI break for the hard-float ABI. */ + else if (DECL_BIT_FIELD (field) + && integer_zerop (DECL_SIZE (field))) + { + /* Prior to GCC-12 these fields were striped early, + hiding them from the back-end entirely and + resulting in the correct behaviour for argument + passing. Simulate that old behaviour without + generating a warning. */ + if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field)) + continue; + if (warn_psabi_flags) + { + *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD; + continue; + } + } sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep, warn_psabi_flags); @@ -6538,8 +6570,10 @@ aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant, && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL)) != ag_count)) { - const char *url + const char *url10 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base"; + const char *url12 + = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields"; gcc_assert (alt == -1); last_reported_type_uid = uid; /* Use TYPE_MAIN_VARIANT to strip any redundant const @@ -6548,12 +6582,16 @@ aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant, inform (input_location, "parameter passing for argument of " "type %qT with %<[[no_unique_address]]%> members " "changed %{in GCC 10.1%}", - TYPE_MAIN_VARIANT (type), url); + TYPE_MAIN_VARIANT (type), url10); else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE) inform (input_location, "parameter passing for argument of " "type %qT when C++17 is enabled changed to match " "C++14 %{in GCC 10.1%}", - TYPE_MAIN_VARIANT (type), url); + TYPE_MAIN_VARIANT (type), url10); + else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD) + inform (input_location, "parameter passing for argument of " + "type %qT changed %{in GCC 12.1%}", + TYPE_MAIN_VARIANT (type), url12); } *count = ag_count; } @@ -10163,6 +10201,61 @@ arm_mem_costs (rtx x, const struct cpu_cost_table *extra_cost, return true; } +/* Helper for arm_bfi_p. */ +static bool +arm_bfi_1_p (rtx op0, rtx op1, rtx *sub0, rtx *sub1) +{ + unsigned HOST_WIDE_INT const1; + unsigned HOST_WIDE_INT const2 = 0; + + if (!CONST_INT_P (XEXP (op0, 1))) + return false; + + const1 = UINTVAL (XEXP (op0, 1)); + if (!CONST_INT_P (XEXP (op1, 1)) + || ~UINTVAL (XEXP (op1, 1)) != const1) + return false; + + if (GET_CODE (XEXP (op0, 0)) == ASHIFT + && CONST_INT_P (XEXP (XEXP (op0, 0), 1))) + { + const2 = UINTVAL (XEXP (XEXP (op0, 0), 1)); + *sub0 = XEXP (XEXP (op0, 0), 0); + } + else + *sub0 = XEXP (op0, 0); + + if (const2 >= GET_MODE_BITSIZE (GET_MODE (op0))) + return false; + + *sub1 = XEXP (op1, 0); + return exact_log2 (const1 + (HOST_WIDE_INT_1U << const2)) >= 0; +} + +/* Recognize a BFI idiom. Helper for arm_rtx_costs_internal. 
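+   A typical source expression is
+   x = (x & ~0x00ff0000) | ((y << 16) & 0x00ff0000),
+   which a single BFI instruction can implement.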
The + format looks something like: + + (IOR (AND (reg1) (~const1)) + (AND (ASHIFT (reg2) (const2)) + (const1))) + + where const1 is a consecutive sequence of 1-bits with the + least-significant non-zero bit starting at bit position const2. If + const2 is zero, then the shift will not appear at all, due to + canonicalization. The two arms of the IOR expression may be + flipped. */ +static bool +arm_bfi_p (rtx x, rtx *sub0, rtx *sub1) +{ + if (GET_CODE (x) != IOR) + return false; + if (GET_CODE (XEXP (x, 0)) != AND + || GET_CODE (XEXP (x, 1)) != AND) + return false; + return (arm_bfi_1_p (XEXP (x, 0), XEXP (x, 1), sub0, sub1) + || arm_bfi_1_p (XEXP (x, 1), XEXP (x, 0), sub1, sub0)); +} + /* RTX costs. Make an estimate of the cost of executing the operation X, which is contained within an operation with code OUTER_CODE. SPEED_P indicates whether the cost desired is the performance cost, @@ -10921,14 +11014,28 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code, *cost = LIBCALL_COST (2); return false; case IOR: - if (mode == SImode && arm_arch6 && aarch_rev16_p (x)) - { - if (speed_p) - *cost += extra_cost->alu.rev; + { + rtx sub0, sub1; + if (mode == SImode && arm_arch6 && aarch_rev16_p (x)) + { + if (speed_p) + *cost += extra_cost->alu.rev; - return true; - } - /* Fall through. */ + return true; + } + else if (mode == SImode && arm_arch_thumb2 + && arm_bfi_p (x, &sub0, &sub1)) + { + *cost += rtx_cost (sub0, mode, ZERO_EXTRACT, 1, speed_p); + *cost += rtx_cost (sub1, mode, ZERO_EXTRACT, 0, speed_p); + if (speed_p) + *cost += extra_cost->alu.bfi; + + return true; + } + } + + /* Fall through. */ case AND: case XOR: if (mode == SImode) { @@ -12811,6 +12918,9 @@ simd_valid_immediate (rtx op, machine_mode mode, int inverse, || n_elts * innersize != 16)) return -1; + if (!TARGET_HAVE_MVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + return -1; + /* Vectors of float constants. */ if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) { @@ -13486,7 +13596,7 @@ mve_vector_mem_operand (machine_mode mode, rtx op, bool strict) int reg_no = REGNO (op); return (((mode == E_V8QImode || mode == E_V4QImode || mode == E_V4HImode) ? reg_no <= LAST_LO_REGNUM - :(reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM)) + : reg_no < LAST_ARM_REGNUM) || (!strict && reg_no >= FIRST_PSEUDO_REGISTER)); } code = GET_CODE (op); @@ -13495,10 +13605,10 @@ mve_vector_mem_operand (machine_mode mode, rtx op, bool strict) || code == PRE_INC || code == POST_DEC) { reg_no = REGNO (XEXP (op, 0)); - return ((mode == E_V8QImode || mode == E_V4QImode || mode == E_V4HImode) - ? reg_no <= LAST_LO_REGNUM - :(reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM)) - || reg_no >= FIRST_PSEUDO_REGISTER; + return (((mode == E_V8QImode || mode == E_V4QImode || mode == E_V4HImode) + ? reg_no <= LAST_LO_REGNUM + :(reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM)) + || (!strict && reg_no >= FIRST_PSEUDO_REGISTER)); } else if (((code == POST_MODIFY || code == PRE_MODIFY) && GET_CODE (XEXP (op, 1)) == PLUS @@ -13539,10 +13649,11 @@ mve_vector_mem_operand (machine_mode mode, rtx op, bool strict) default: return FALSE; } - return reg_no >= FIRST_PSEUDO_REGISTER - || (MVE_STN_LDW_MODE (mode) - ? reg_no <= LAST_LO_REGNUM - : (reg_no < LAST_ARM_REGNUM && reg_no != SP_REGNUM)); + return ((!strict && reg_no >= FIRST_PSEUDO_REGISTER) + || (MVE_STN_LDW_MODE (mode) + ? 
reg_no <= LAST_LO_REGNUM + : (reg_no < LAST_ARM_REGNUM + && (code == PLUS || reg_no != SP_REGNUM)))); } return FALSE; } @@ -15633,13 +15744,21 @@ gen_cpymem_ldrd_strd (rtx *operands) { len -= 8; reg0 = gen_reg_rtx (DImode); - rtx low_reg = NULL_RTX; - rtx hi_reg = NULL_RTX; + rtx first_reg = NULL_RTX; + rtx second_reg = NULL_RTX; if (!src_aligned || !dst_aligned) { - low_reg = gen_lowpart (SImode, reg0); - hi_reg = gen_highpart_mode (SImode, DImode, reg0); + if (BYTES_BIG_ENDIAN) + { + second_reg = gen_lowpart (SImode, reg0); + first_reg = gen_highpart_mode (SImode, DImode, reg0); + } + else + { + first_reg = gen_lowpart (SImode, reg0); + second_reg = gen_highpart_mode (SImode, DImode, reg0); + } } if (MEM_ALIGN (src) >= 2 * BITS_PER_WORD) emit_move_insn (reg0, src); @@ -15647,9 +15766,9 @@ gen_cpymem_ldrd_strd (rtx *operands) emit_insn (gen_unaligned_loaddi (reg0, src)); else { - emit_insn (gen_unaligned_loadsi (low_reg, src)); + emit_insn (gen_unaligned_loadsi (first_reg, src)); src = next_consecutive_mem (src); - emit_insn (gen_unaligned_loadsi (hi_reg, src)); + emit_insn (gen_unaligned_loadsi (second_reg, src)); } if (MEM_ALIGN (dst) >= 2 * BITS_PER_WORD) @@ -15658,9 +15777,9 @@ gen_cpymem_ldrd_strd (rtx *operands) emit_insn (gen_unaligned_storedi (dst, reg0)); else { - emit_insn (gen_unaligned_storesi (dst, low_reg)); + emit_insn (gen_unaligned_storesi (dst, first_reg)); dst = next_consecutive_mem (dst); - emit_insn (gen_unaligned_storesi (dst, hi_reg)); + emit_insn (gen_unaligned_storesi (dst, second_reg)); } src = next_consecutive_mem (src); @@ -23738,8 +23857,8 @@ arm_print_condition (FILE *stream) /* Globally reserved letters: acln Puncutation letters currently used: @_|?().!# Lower case letters currently used: bcdefhimpqtvwxyz - Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTU - Letters previously used, but now deprecated/obsolete: sVWXYZ. + Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTUV + Letters previously used, but now deprecated/obsolete: sWXYZ. Note that the global reservation for 'c' is only for CONSTANT_ADDRESS_P. @@ -23755,7 +23874,10 @@ arm_print_condition (FILE *stream) If CODE is 'N' then X is a floating point operand that must be negated before output. If CODE is 'B' then output a bitwise inverted value of X (a const int). - If X is a REG and CODE is `M', output a ldm/stm style multi-reg. */ + If X is a REG and CODE is `M', output a ldm/stm style multi-reg. + If CODE is 'V', then the operand must be a CONST_INT representing + the bits to preserve in the modified register (Rd) of a BFI or BFC + instruction: print out both the width and lsb (shift) fields. */ static void arm_print_operand (FILE *stream, rtx x, int code) { @@ -24064,8 +24186,27 @@ arm_print_operand (FILE *stream, rtx x, int code) stream); return; - case 's': case 'V': + { + /* Output the LSB (shift) and width for a bitmask instruction + based on a literal mask. The LSB is printed first, + followed by the width. + + Eg. For 0b1...1110001, the result is #1, #3. */ + if (!CONST_INT_P (x)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + unsigned HOST_WIDE_INT val = ~XUINT (x, 0); + int lsb = exact_log2 (val & -val); + asm_fprintf (stream, "#%d, #%d", lsb, + (exact_log2 (val + (val & -val)) - lsb)); + } + return; + + case 's': case 'W': case 'X': case 'Y': @@ -31771,9 +31912,13 @@ arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. 
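   The arm implementation only handles permutes whose operand mode matches
   the result mode, so it rejects any request where the new OP_MODE
   argument differs from VMODE.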
*/ static bool -arm_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1, +arm_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, const vec_perm_indices &sel) { + if (vmode != op_mode) + return false; + struct expand_vec_perm_d d; int i, nelt, which; diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 60468f6..69bf343 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -3002,30 +3002,36 @@ ; ??? Check split length for Thumb-2 (define_insn_and_split "*arm_andsi3_insn" - [(set (match_operand:SI 0 "s_register_operand" "=r,l,r,r,r") - (and:SI (match_operand:SI 1 "s_register_operand" "%r,0,r,r,r") - (match_operand:SI 2 "reg_or_int_operand" "I,l,K,r,?n")))] + [(set (match_operand:SI 0 "s_register_operand" "=r,l,r,r,r,r") + (and:SI (match_operand:SI 1 "s_register_operand" "%r,0,r,r,0,r") + (match_operand:SI 2 "reg_or_int_operand" "I,l,K,r,Dj,?n")))] "TARGET_32BIT" "@ and%?\\t%0, %1, %2 and%?\\t%0, %1, %2 bic%?\\t%0, %1, #%B2 and%?\\t%0, %1, %2 + bfc%?\\t%0, %V2 #" "TARGET_32BIT && CONST_INT_P (operands[2]) && !(const_ok_for_arm (INTVAL (operands[2])) - || const_ok_for_arm (~INTVAL (operands[2])))" + || const_ok_for_arm (~INTVAL (operands[2])) + || (arm_arch_thumb2 + && satisfies_constraint_Dj (operands[2]) + && (rtx_equal_p (operands[0], operands[1]) + || !reload_completed)))" [(clobber (const_int 0))] " - arm_split_constant (AND, SImode, curr_insn, + arm_split_constant (AND, SImode, curr_insn, INTVAL (operands[2]), operands[0], operands[1], 0); DONE; " - [(set_attr "length" "4,4,4,4,16") + [(set_attr "length" "4,4,4,4,4,16") (set_attr "predicable" "yes") - (set_attr "predicable_short_it" "no,yes,no,no,no") - (set_attr "type" "logic_imm,logic_imm,logic_reg,logic_reg,logic_imm")] + (set_attr "predicable_short_it" "no,yes,no,no,no,no") + (set_attr "arch" "*,*,*,*,v6t2,*") + (set_attr "type" "logic_imm,logic_imm,logic_reg,logic_reg,bfm,logic_imm")] ) (define_insn "*andsi3_compare0" @@ -3471,13 +3477,25 @@ }" ) -(define_insn "insv_zero" +(define_insn_and_split "insv_zero" [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r") (match_operand:SI 1 "const_int_M_operand" "M") (match_operand:SI 2 "const_int_M_operand" "M")) (const_int 0))] "arm_arch_thumb2" "bfc%?\t%0, %2, %1" + "" + [(set (match_dup 0) (and:SI (match_dup 0) (match_dup 1)))] + { + /* Convert back to a normal AND operation, so that we can take advantage + of BIC and AND when appropriate; we'll still emit BFC if that's the + right thing to do. 
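+       The replacement AND mask is ~(((1 << width) - 1) << lsb), i.e. it
+       clears WIDTH bits starting at bit LSB and preserves everything else.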
*/ + unsigned HOST_WIDE_INT width = UINTVAL (operands[1]); + unsigned HOST_WIDE_INT lsb = UINTVAL (operands[2]); + unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << width) - 1; + + operands[1] = gen_int_mode (~(mask << lsb), SImode); + } [(set_attr "length" "4") (set_attr "predicable" "yes") (set_attr "type" "bfm")] @@ -3495,6 +3513,76 @@ (set_attr "type" "bfm")] ) +(define_insn "*bfi" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ior:SI (and:SI (match_operand:SI 1 "s_register_operand" "0") + (match_operand 2 "const_int_operand" "Dj")) + (and:SI (ashift:SI + (match_operand:SI 3 "s_register_operand" "r") + (match_operand 4 "const_int_operand" "i")) + (match_operand 5 "const_int_operand" "i"))))] + "arm_arch_thumb2 + && UINTVAL (operands[4]) < 32 + && UINTVAL (operands[2]) == ~UINTVAL (operands[5]) + && (exact_log2 (UINTVAL (operands[5]) + + (HOST_WIDE_INT_1U << UINTVAL (operands[4]))) + >= 0)" + "bfi%?\t%0, %3, %V2" + [(set_attr "length" "4") + (set_attr "predicable" "yes") + (set_attr "type" "bfm")] +) + +(define_insn "*bfi_alt1" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ior:SI (and:SI (ashift:SI + (match_operand:SI 3 "s_register_operand" "r") + (match_operand 4 "const_int_operand" "i")) + (match_operand 5 "const_int_operand" "i")) + (and:SI (match_operand:SI 1 "s_register_operand" "0") + (match_operand 2 "const_int_operand" "Dj"))))] + "arm_arch_thumb2 + && UINTVAL (operands[4]) < 32 + && UINTVAL (operands[2]) == ~UINTVAL (operands[5]) + && (exact_log2 (UINTVAL (operands[5]) + + (HOST_WIDE_INT_1U << UINTVAL (operands[4]))) + >= 0)" + "bfi%?\t%0, %3, %V2" + [(set_attr "length" "4") + (set_attr "predicable" "yes") + (set_attr "type" "bfm")] +) + +(define_insn "*bfi_alt2" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ior:SI (and:SI (match_operand:SI 1 "s_register_operand" "0") + (match_operand 2 "const_int_operand" "i")) + (and:SI (match_operand:SI 3 "s_register_operand" "r") + (match_operand 4 "const_int_operand" "i"))))] + "arm_arch_thumb2 + && UINTVAL (operands[2]) == ~UINTVAL (operands[4]) + && exact_log2 (UINTVAL (operands[4]) + 1) >= 0" + "bfi%?\t%0, %3, %V2" + [(set_attr "length" "4") + (set_attr "predicable" "yes") + (set_attr "type" "bfm")] +) + +(define_insn "*bfi_alt3" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (ior:SI (and:SI (match_operand:SI 3 "s_register_operand" "r") + (match_operand 4 "const_int_operand" "i")) + (and:SI (match_operand:SI 1 "s_register_operand" "0") + (match_operand 2 "const_int_operand" "i"))))] + "arm_arch_thumb2 + && UINTVAL (operands[2]) == ~UINTVAL (operands[4]) + && exact_log2 (UINTVAL (operands[4]) + 1) >= 0" + "bfi%?\t%0, %3, %V2" + [(set_attr "length" "4") + (set_attr "predicable" "yes") + (set_attr "type" "bfm")] +) + (define_insn "andsi_notsi_si" [(set (match_operand:SI 0 "s_register_operand" "=r") (and:SI (not:SI (match_operand:SI 2 "s_register_operand" "r")) diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index 2b411b0..e5a36d2 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -32,7 +32,7 @@ ;; The following multi-letter normal constraints have been used: ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, DN, Dm, Dl, DL, Do, Dv, Dy, Di, -;; Ds, Dt, Dp, Dz, Tu, Te +;; Dj, Ds, Dt, Dp, Dz, Tu, Te ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe ;; in Thumb-2 state: Ha, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Rd, Rf, Rb, Ra, ;; Rg, Ri @@ -354,6 +354,14 @@ (and (match_code "const_double,const_int") (match_test "TARGET_32BIT && 
arm_const_double_by_immediates (op)"))) +(define_constraint "Dj" + "@internal + In cores with the v6t2 ISA, a constant with exactly one consecutive + string of zero bits." + (and (match_code "const_int") + (match_test "arm_arch_thumb2 + && exact_log2 (~ival + (~ival & -~ival)) >= 0"))) + (define_constraint "Dm" "@internal In ARM/Thumb-2 state a const_vector which can be loaded with a Neon vmov diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 908bedc..c4dec01 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -18,66 +18,73 @@ ;; <http://www.gnu.org/licenses/>. (define_insn "*mve_mov<mode>" - [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w,w,r,w,Ux,w") - (match_operand:MVE_types 1 "general_operand" "w,r,w,Dn,UxUi,r,Dm,w,Ul"))] + [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w , w, r,Ux,w") + (match_operand:MVE_types 1 "general_operand" " w,r,w,DnDm,UxUi,r,w, Ul"))] "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT" { - if (which_alternative == 3 || which_alternative == 6) + switch (which_alternative) { - int width, is_valid; - static char templ[40]; + case 0: /* [w,w]. */ + return "vmov\t%q0, %q1"; - is_valid = simd_immediate_valid_for_move (operands[1], <MODE>mode, - &operands[1], &width); + case 1: /* [w,r]. */ + return "vmov\t%e0, %Q1, %R1 %@ <mode>\;vmov\t%f0, %J1, %K1"; + + case 2: /* [r,w]. */ + return "vmov\t%Q0, %R0, %e1 %@ <mode>\;vmov\t%J0, %K0, %f1"; + + case 3: /* [w,DnDm]. */ + { + int width, is_valid; + + is_valid = simd_immediate_valid_for_move (operands[1], <MODE>mode, + &operands[1], &width); + + gcc_assert (is_valid); + + if (width == 0) + return "vmov.f32\t%q0, %1 %@ <mode>"; + else + { + const int templ_size = 40; + static char templ[templ_size]; + if (snprintf (templ, templ_size, + "vmov.i%d\t%%q0, %%x1 %%@ <mode>", width) + > templ_size) + abort (); + return templ; + } + } + + case 4: /* [w,UxUi]. */ + if (<MODE>mode == V2DFmode || <MODE>mode == V2DImode + || <MODE>mode == TImode) + return "vldrw.u32\t%q0, %E1"; + else + return "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1"; - gcc_assert (is_valid != 0); + case 5: /* [r,r]. */ + return output_move_quad (operands); - if (width == 0) - return "vmov.f32\t%q0, %1 @ <mode>"; + case 6: /* [Ux,w]. */ + if (<MODE>mode == V2DFmode || <MODE>mode == V2DImode + || <MODE>mode == TImode) + return "vstrw.32\t%q1, %E0"; else - sprintf (templ, "vmov.i%d\t%%q0, %%x1 @ <mode>", width); - return templ; - } + return "vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0"; - if (which_alternative == 4 || which_alternative == 7) - { - if (<MODE>mode == V2DFmode || <MODE>mode == V2DImode || <MODE>mode == TImode) - { - if (which_alternative == 7) - output_asm_insn ("vstrw.32\t%q1, %E0", operands); - else - output_asm_insn ("vldrw.u32\t%q0, %E1",operands); - } - else - { - if (which_alternative == 7) - output_asm_insn ("vstr<V_sz_elem1>.<V_sz_elem>\t%q1, %E0", operands); - else - output_asm_insn ("vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1", operands); - } - return ""; - } - switch (which_alternative) - { - case 0: - return "vmov\t%q0, %q1"; - case 1: - return "vmov\t%e0, %Q1, %R1 @ <mode>\;vmov\t%f0, %J1, %K1"; - case 2: - return "vmov\t%Q0, %R0, %e1 @ <mode>\;vmov\t%J0, %K0, %f1"; - case 5: - return output_move_quad (operands); - case 8: + case 7: /* [w,Ul]. 
*/ return output_move_neon (operands); + default: gcc_unreachable (); return ""; } } - [(set_attr "type" "mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_move,mve_store,mve_load") - (set_attr "length" "4,8,8,4,8,8,4,4,4") - (set_attr "thumb2_pool_range" "*,*,*,*,1018,*,*,*,*") - (set_attr "neg_pool_range" "*,*,*,*,996,*,*,*,*")]) + [(set_attr "type" "mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_store,mve_load") + (set_attr "length" "4,8,8,4,4,8,4,8") + (set_attr "thumb2_pool_range" "*,*,*,*,1018,*,*,*") + (set_attr "neg_pool_range" "*,*,*,*,996,*,*,*")]) (define_insn "*mve_vdup<mode>" [(set (match_operand:MVE_vecs 0 "s_register_operand" "=w") @@ -535,26 +542,6 @@ [(set_attr "type" "mve_move") ]) -(define_insn "mve_vec_unpack<US>_lo_<mode>" - [(set (match_operand:<V_unpack> 0 "register_operand" "=w") - (SE:<V_unpack> (vec_select:<V_HALF> - (match_operand:MVE_3 1 "register_operand" "w") - (match_operand:MVE_3 2 "vect_par_constant_low" ""))))] - "TARGET_HAVE_MVE" - "vmovlb.<US>%#<V_sz_elem> %q0, %q1" - [(set_attr "type" "mve_move")] -) - -(define_insn "mve_vec_unpack<US>_hi_<mode>" - [(set (match_operand:<V_unpack> 0 "register_operand" "=w") - (SE:<V_unpack> (vec_select:<V_HALF> - (match_operand:MVE_3 1 "register_operand" "w") - (match_operand:MVE_3 2 "vect_par_constant_high" ""))))] - "TARGET_HAVE_MVE" - "vmovlt.<US>%#<V_sz_elem> %q0, %q1" - [(set_attr "type" "mve_move")] -) - ;; ;; [vcvtpq_s, vcvtpq_u]) ;; @@ -2219,23 +2206,10 @@ [(set_attr "type" "mve_move") ]) -;; vmovnb pattern used by the vec_pack_trunc expander to avoid the -;; need for an uninitialized input operand. -(define_insn "@mve_vec_pack_trunc_lo_<mode>" - [ - (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w") - (unspec:<V_narrow_pack> [(match_operand:MVE_5 1 "s_register_operand" "w")] - VMOVNBQ_S)) - ] - "TARGET_HAVE_MVE" - "vmovnb.i%#<V_sz_elem> %q0, %q1" - [(set_attr "type" "mve_move") -]) - ;; ;; [vmovntq_s, vmovntq_u]) ;; -(define_insn "@mve_vmovntq_<supf><mode>" +(define_insn "mve_vmovntq_<supf><mode>" [ (set (match_operand:<V_narrow_pack> 0 "s_register_operand" "=w") (unspec:<V_narrow_pack> [(match_operand:<V_narrow_pack> 1 "s_register_operand" "0") @@ -10495,7 +10469,7 @@ ) (define_insn "*movmisalign<mode>_mve_store" - [(set (match_operand:MVE_VLD_ST 0 "neon_permissive_struct_operand" "=Ux") + [(set (match_operand:MVE_VLD_ST 0 "mve_memory_operand" "=Ux") (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "s_register_operand" " w")] UNSPEC_MISALIGNED_ACCESS))] "((TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) @@ -10508,7 +10482,7 @@ (define_insn "*movmisalign<mode>_mve_load" [(set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w") - (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "neon_permissive_struct_operand" " Ux")] + (unspec:MVE_VLD_ST [(match_operand:MVE_VLD_ST 1 "mve_memory_operand" " Ux")] UNSPEC_MISALIGNED_ACCESS))] "((TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index f270ded..275bcc1 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -6005,6 +6005,43 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_shift_imm_long")] ) +(define_expand "vec_unpack<US>_hi_<mode>" + [(match_operand:<V_unpack> 0 "register_operand") + (SE:<V_unpack> (match_operand:VU 1 "register_operand"))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" + { + rtvec v = rtvec_alloc (<V_mode_nunits>/2) ; + rtx t1; + int i; + for (i = 0; i < (<V_mode_nunits>/2); i++) + 
RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i); + + t1 = gen_rtx_PARALLEL (<MODE>mode, v); + emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], + operands[1], + t1)); + DONE; + } +) + +(define_expand "vec_unpack<US>_lo_<mode>" + [(match_operand:<V_unpack> 0 "register_operand") + (SE:<V_unpack> (match_operand:VU 1 "register_operand"))] + "TARGET_NEON && !BYTES_BIG_ENDIAN" + { + rtvec v = rtvec_alloc (<V_mode_nunits>/2) ; + rtx t1; + int i; + for (i = 0; i < (<V_mode_nunits>/2) ; i++) + RTVEC_ELT (v, i) = GEN_INT (i); + t1 = gen_rtx_PARALLEL (<MODE>mode, v); + emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], + operands[1], + t1)); + DONE; + } +) + (define_insn "neon_vec_<US>mult_lo_<mode>" [(set (match_operand:<V_unpack> 0 "register_operand" "=w") (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF> @@ -6220,7 +6257,7 @@ if (BYTES_BIG_ENDIAN) ; because the ordering of vector elements in Q registers is different from what ; the semantics of the instructions require. -(define_insn "neon_quad_vec_pack_trunc_<mode>" +(define_insn "vec_pack_trunc_<mode>" [(set (match_operand:<V_narrow_pack> 0 "register_operand" "=&w") (vec_concat:<V_narrow_pack> (truncate:<V_narrow> diff --git a/gcc/config/arm/t-aprofile b/gcc/config/arm/t-aprofile index 574951c..fe2ec88 100644 --- a/gcc/config/arm/t-aprofile +++ b/gcc/config/arm/t-aprofile @@ -26,8 +26,8 @@ # Arch and FPU variants to build libraries with -MULTI_ARCH_OPTS_A = march=armv7-a/march=armv7-a+fp/march=armv7-a+simd/march=armv7ve+simd/march=armv8-a/march=armv8-a+simd/march=armv9-a/march=armv9-a+simd -MULTI_ARCH_DIRS_A = v7-a v7-a+fp v7-a+simd v7ve+simd v8-a v8-a+simd v9-a v9-a+simd +MULTI_ARCH_OPTS_A = march=armv7-a/march=armv7-a+fp/march=armv7-a+simd/march=armv7ve+simd/march=armv8-a/march=armv8-a+simd +MULTI_ARCH_DIRS_A = v7-a v7-a+fp v7-a+simd v7ve+simd v8-a v8-a+simd # ARMv7-A - build nofp, fp-d16 and SIMD variants @@ -46,11 +46,6 @@ MULTILIB_REQUIRED += mthumb/march=armv8-a/mfloat-abi=soft MULTILIB_REQUIRED += mthumb/march=armv8-a+simd/mfloat-abi=hard MULTILIB_REQUIRED += mthumb/march=armv8-a+simd/mfloat-abi=softfp -# Armv9-A - build nofp and SIMD variants. -MULTILIB_REQUIRED += mthumb/march=armv9-a/mfloat-abi=soft -MULTILIB_REQUIRED += mthumb/march=armv9-a+simd/mfloat-abi=hard -MULTILIB_REQUIRED += mthumb/march=armv9-a+simd/mfloat-abi=softfp - # Matches # Arch Matches @@ -135,14 +130,12 @@ MULTILIB_MATCHES += $(foreach ARCH, $(v8_6_a_simd_variants), \ march?armv8-a+simd=march?armv8.6-a$(ARCH)) # Armv9 without SIMD: map down to base architecture -MULTILIB_MATCHES += $(foreach ARCH, $(v9_a_nosimd_variants), \ - march?armv9-a=march?armv9-a$(ARCH)) +MULTILIB_MATCHES += march?armv8-a=march?armv9-a +# No variants without SIMD. # Armv9 with SIMD: map down to base arch + simd -MULTILIB_MATCHES += march?armv9-a+simd=march?armv9-a+crc+simd \ - $(foreach ARCH, $(filter-out +simd, $(v9_a_simd_variants)), \ - march?armv9-a+simd=march?armv9-a$(ARCH) \ - march?armv9-a+simd=march?armv9-a+crc$(ARCH)) +MULTILIB_MATCHES += $(foreach ARCH, $(v9_a_simd_variants), \ + march?armv8-a+simd=march?armv9-a$(ARCH)) # Use Thumb libraries for everything. 
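(Aside, not part of the patch: the vec_unpack<US>_hi/_lo expanders re-added to neon.md above build a PARALLEL of lane indices, _lo selecting lanes 0 .. N/2-1 and _hi selecting lanes N/2 .. N-1, little-endian only. A scalar reference model of the unsigned V8QI -> V4HI case, with invented names, purely for illustration:)

  #include <stdint.h>

  /* vec_unpacku_lo: widen lanes 0..3 of an 8x8-bit vector.  */
  static void
  unpacku_lo_v8qi (const uint8_t in[8], uint16_t out[4])
  {
    for (int i = 0; i < 4; i++)
      out[i] = in[i];
  }

  /* vec_unpacku_hi: widen lanes 4..7 (index N/2 + i in the PARALLEL).  */
  static void
  unpacku_hi_v8qi (const uint8_t in[8], uint16_t out[4])
  {
    for (int i = 0; i < 4; i++)
      out[i] = in[4 + i];
  }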
@@ -150,13 +143,11 @@ MULTILIB_REUSE += mthumb/march.armv7-a/mfloat-abi.soft=marm/march.armv7-a/mfloa MULTILIB_REUSE += mthumb/march.armv8-a/mfloat-abi.soft=marm/march.armv8-a/mfloat-abi.soft -MULTILIB_REUSE += mthumb/march.armv9-a/mfloat-abi.soft=marm/march.armv9-a/mfloat-abi.soft - MULTILIB_REUSE += $(foreach ABI, hard softfp, \ - $(foreach ARCH, armv7-a+fp armv7-a+simd armv7ve+simd armv8-a+simd armv9-a+simd, \ + $(foreach ARCH, armv7-a+fp armv7-a+simd armv7ve+simd armv8-a+simd, \ mthumb/march.$(ARCH)/mfloat-abi.$(ABI)=marm/march.$(ARCH)/mfloat-abi.$(ABI))) # Softfp but no FP, use the soft-float libraries. MULTILIB_REUSE += $(foreach MODE, arm thumb, \ - $(foreach ARCH, armv7-a armv8-a armv9-a, \ + $(foreach ARCH, armv7-a armv8-a, \ mthumb/march.$(ARCH)/mfloat-abi.soft=m$(MODE)/march.$(ARCH)/mfloat-abi.softfp)) diff --git a/gcc/config/arm/t-multilib b/gcc/config/arm/t-multilib index ea258b1..6bb58d3 100644 --- a/gcc/config/arm/t-multilib +++ b/gcc/config/arm/t-multilib @@ -78,7 +78,6 @@ v8_4_a_simd_variants := $(call all_feat_combs, simd fp16 crypto i8mm bf16) v8_5_a_simd_variants := $(call all_feat_combs, simd fp16 crypto i8mm bf16) v8_6_a_simd_variants := $(call all_feat_combs, simd fp16 crypto i8mm bf16) v8_r_nosimd_variants := +crc -v9_a_nosimd_variants := +crc v9_a_simd_variants := $(call all_feat_combs, simd fp16 crypto i8mm bf16) ifneq (,$(HAS_APROFILE)) @@ -206,14 +205,10 @@ MULTILIB_MATCHES += $(foreach ARCH, $(v8_6_a_simd_variants), \ # Armv9 MULTILIB_MATCHES += march?armv7=march?armv9-a -MULTILIB_MATCHES += $(foreach ARCH, $(v9_a_nosimd_variants), \ - march?armv7=march?armv9-a$(ARCH)) # Armv9 with SIMD -MULTILIB_MATCHES += march?armv7+fp=march?armv9-a+crc+simd \ - $(foreach ARCH, $(v9_a_simd_variants), \ - march?armv7+fp=march?armv9-a$(ARCH) \ - march?armv7+fp=march?armv9-a+crc$(ARCH)) +MULTILIB_MATCHES += $(foreach ARCH, $(v9_a_simd_variants), \ + march?armv7+fp=march?armv9-a$(ARCH)) endif # Not APROFILE. # Use Thumb libraries for everything. diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index f130090..1fd68f3 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -280,29 +280,81 @@ DONE; }) -(define_expand "movmisalign<mode>" - [(set (match_operand:VDQ 0 "neon_perm_struct_or_reg_operand") - (unspec:VDQ [(match_operand:VDQ 1 "neon_perm_struct_or_reg_operand")] +(define_expand "@movmisalign<mode>" + [(set (match_operand:VDQ 0 "nonimmediate_operand") + (unspec:VDQ [(match_operand:VDQ 1 "general_operand")] UNSPEC_MISALIGNED_ACCESS))] "ARM_HAVE_<MODE>_LDST && !BYTES_BIG_ENDIAN && unaligned_access && !TARGET_REALLY_IWMMXT" { - rtx adjust_mem; - /* This pattern is not permitted to fail during expansion: if both arguments - are non-registers (e.g. memory := constant, which can be created by the - auto-vectorizer), force operand 1 into a register. */ - if (!s_register_operand (operands[0], <MODE>mode) - && !s_register_operand (operands[1], <MODE>mode)) - operands[1] = force_reg (<MODE>mode, operands[1]); - - if (s_register_operand (operands[0], <MODE>mode)) - adjust_mem = operands[1]; - else - adjust_mem = operands[0]; - - /* Legitimize address. */ - if (!neon_vector_mem_operand (adjust_mem, 2, true)) - XEXP (adjust_mem, 0) = force_reg (Pmode, XEXP (adjust_mem, 0)); + rtx *memloc; + bool for_store = false; + /* This pattern is not permitted to fail during expansion: if both arguments + are non-registers (e.g. memory := constant, which can be created by the + auto-vectorizer), force operand 1 into a register. 
*/ + if (!s_register_operand (operands[0], <MODE>mode) + && !s_register_operand (operands[1], <MODE>mode)) + operands[1] = force_reg (<MODE>mode, operands[1]); + + if (s_register_operand (operands[0], <MODE>mode)) + memloc = &operands[1]; + else + { + memloc = &operands[0]; + for_store = true; + } + + /* For MVE, vector loads/stores must be aligned to the element size. If the + alignment is less than that convert the load/store to a suitable mode. */ + if (TARGET_HAVE_MVE + && (MEM_ALIGN (*memloc) + < GET_MODE_ALIGNMENT (GET_MODE_INNER (<MODE>mode)))) + { + scalar_mode new_smode; + switch (MEM_ALIGN (*memloc)) + { + case 64: + case 32: + new_smode = SImode; + break; + case 16: + new_smode = HImode; + break; + default: + new_smode = QImode; + break; + } + machine_mode new_mode + = mode_for_vector (new_smode, + GET_MODE_SIZE (<MODE>mode) + / GET_MODE_SIZE (new_smode)).require (); + rtx new_mem = adjust_address (*memloc, new_mode, 0); + + if (!for_store) + { + rtx reg = gen_reg_rtx (new_mode); + emit_insn (gen_movmisalign (new_mode, reg, new_mem)); + emit_move_insn (operands[0], gen_lowpart (<MODE>mode, reg)); + DONE; + } + emit_insn (gen_movmisalign (new_mode, new_mem, + gen_lowpart (new_mode, operands[1]))); + DONE; + } + + /* Legitimize address. */ + if ((TARGET_HAVE_MVE + && !mve_vector_mem_operand (<MODE>mode, XEXP (*memloc, 0), false)) + || (!TARGET_HAVE_MVE + && !neon_vector_mem_operand (*memloc, 2, false))) + { + rtx new_mem + = replace_equiv_address (*memloc, + force_reg (Pmode, XEXP (*memloc, 0)), + false); + gcc_assert (MEM_ALIGN (new_mem) == MEM_ALIGN (*memloc)); + *memloc = new_mem; + } }) (define_insn "mve_vshlq_<supf><mode>" @@ -580,77 +632,6 @@ "ARM_HAVE_<MODE>_ARITH && !TARGET_REALLY_IWMMXT" ) - -;; vmovl[tb] are not available for V4SI on MVE -(define_expand "vec_unpack<US>_hi_<mode>" - [(set (match_operand:<V_unpack> 0 "register_operand") - (SE:<V_unpack> (vec_select:<V_HALF> - (match_operand:VU 1 "register_operand") - (match_dup 2))))] - "ARM_HAVE_<MODE>_ARITH - && !TARGET_REALLY_IWMMXT - && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE) - && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (<V_mode_nunits>/2); - int i; - for (i = 0; i < (<V_mode_nunits>/2); i++) - RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i); - - operands[2] = gen_rtx_PARALLEL (<MODE>mode, v); - } -) - -;; vmovl[tb] are not available for V4SI on MVE -(define_expand "vec_unpack<US>_lo_<mode>" - [(set (match_operand:<V_unpack> 0 "register_operand") - (SE:<V_unpack> (vec_select:<V_HALF> - (match_operand:VU 1 "register_operand") - (match_dup 2))))] - "ARM_HAVE_<MODE>_ARITH - && !TARGET_REALLY_IWMMXT - && ! (<MODE>mode == V4SImode && TARGET_HAVE_MVE) - && !BYTES_BIG_ENDIAN" - { - rtvec v = rtvec_alloc (<V_mode_nunits>/2); - int i; - for (i = 0; i < (<V_mode_nunits>/2) ; i++) - RTVEC_ELT (v, i) = GEN_INT (i); - - operands[2] = gen_rtx_PARALLEL (<MODE>mode, v); - - } -) - -;; vmovn[tb] are not available for V2DI on MVE -(define_expand "vec_pack_trunc_<mode>" - [(set (match_operand:<V_narrow_pack> 0 "register_operand") - (vec_concat:<V_narrow_pack> - (truncate:<V_narrow> - (match_operand:VN 1 "register_operand")) - (truncate:<V_narrow> - (match_operand:VN 2 "register_operand"))))] - "ARM_HAVE_<MODE>_ARITH - && !TARGET_REALLY_IWMMXT - && ! 
(<MODE>mode == V2DImode && TARGET_HAVE_MVE) - && !BYTES_BIG_ENDIAN" - { - if (TARGET_NEON) - { - emit_insn (gen_neon_quad_vec_pack_trunc_<mode> (operands[0], operands[1], - operands[2])); - } - else - { - rtx tmpreg = gen_reg_rtx (<V_narrow_pack>mode); - emit_insn (gen_mve_vec_pack_trunc_lo (<MODE>mode, tmpreg, operands[1])); - emit_insn (gen_mve_vmovntq (VMOVNTQ_S, <MODE>mode, - operands[0], tmpreg, operands[2])); - } - DONE; - } -) - (define_expand "vec_init<mode><V_elem_l>" [(match_operand:VDQX 0 "s_register_operand") (match_operand 1 "" "")] diff --git a/gcc/config/avr/avr-mcus.def b/gcc/config/avr/avr-mcus.def index 1e12ab3..fa5e668 100644 --- a/gcc/config/avr/avr-mcus.def +++ b/gcc/config/avr/avr-mcus.def @@ -306,6 +306,14 @@ AVR_MCU ("atxmega16c4", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega16C4__" AVR_MCU ("atxmega32a4u", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega32A4U__", 0x2000, 0x0, 0x9000, 0) AVR_MCU ("atxmega32c4", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega32C4__", 0x2000, 0x0, 0x9000, 0) AVR_MCU ("atxmega32e5", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32E5__", 0x2000, 0x0, 0x9000, 0) +AVR_MCU ("avr64da28", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DA28__", 0x6000, 0x0, 0x8000, 0x10000) +AVR_MCU ("avr64da32", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DA32__", 0x6000, 0x0, 0x8000, 0x10000) +AVR_MCU ("avr64da48", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DA48__", 0x6000, 0x0, 0x8000, 0x10000) +AVR_MCU ("avr64da64", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DA64__", 0x6000, 0x0, 0x8000, 0x10000) +AVR_MCU ("avr64db28", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DB28__", 0x6000, 0x0, 0x8000, 0x10000) +AVR_MCU ("avr64db32", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DB32__", 0x6000, 0x0, 0x8000, 0x10000) +AVR_MCU ("avr64db48", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DB48__", 0x6000, 0x0, 0x8000, 0x10000) +AVR_MCU ("avr64db64", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_AVR64DB64__", 0x6000, 0x0, 0x8000, 0x10000) /* Xmega, Flash + RAM < 64K, flash visible in RAM address space */ AVR_MCU ("avrxmega3", ARCH_AVRXMEGA3, AVR_ISA_NONE, NULL, 0x3f00, 0x0, 0x8000, 0) AVR_MCU ("attiny202", ARCH_AVRXMEGA3, AVR_ISA_RCALL, "__AVR_ATtiny202__", 0x3f80, 0x0, 0x800, 0x8000) @@ -342,6 +350,12 @@ AVR_MCU ("atmega3208", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_ATmega3208__" AVR_MCU ("atmega3209", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_ATmega3209__", 0x3800, 0x0, 0x8000, 0x4000) AVR_MCU ("atmega4808", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_ATmega4808__", 0x2800, 0x0, 0xc000, 0x4000) AVR_MCU ("atmega4809", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_ATmega4809__", 0x2800, 0x0, 0xc000, 0x4000) +AVR_MCU ("avr32da28", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_AVR32DA28__", 0x7000, 0x0, 0x8000, 0x8000) +AVR_MCU ("avr32da32", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_AVR32DA32__", 0x7000, 0x0, 0x8000, 0x8000) +AVR_MCU ("avr32da48", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_AVR32DA48__", 0x7000, 0x0, 0x8000, 0x8000) +AVR_MCU ("avr32db28", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_AVR32DB28__", 0x7000, 0x0, 0x8000, 0x8000) +AVR_MCU ("avr32db32", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_AVR32DB32__", 0x7000, 0x0, 0x8000, 0x8000) +AVR_MCU ("avr32db48", ARCH_AVRXMEGA3, AVR_ISA_NONE, "__AVR_AVR32DB48__", 0x7000, 0x0, 0x8000, 0x8000) /* Xmega, 64K < Flash <= 128K, RAM <= 64K */ AVR_MCU ("avrxmega4", ARCH_AVRXMEGA4, AVR_ISA_NONE, NULL, 0x2000, 0x0, 0x11000, 0) AVR_MCU ("atxmega64a3", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_ATxmega64A3__", 0x2000, 0x0, 0x11000, 0) @@ -352,6 +366,14 @@ AVR_MCU ("atxmega64b1", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64B1__" 
AVR_MCU ("atxmega64b3", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64B3__", 0x2000, 0x0, 0x11000, 0) AVR_MCU ("atxmega64c3", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64C3__", 0x2000, 0x0, 0x11000, 0) AVR_MCU ("atxmega64d4", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_ATxmega64D4__", 0x2000, 0x0, 0x11000, 0) +AVR_MCU ("avr128da28", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_AVR128DA28__", 0x4000, 0x0, 0x8000, 0x20000) +AVR_MCU ("avr128da32", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_AVR128DA32__", 0x4000, 0x0, 0x8000, 0x20000) +AVR_MCU ("avr128da48", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_AVR128DA48__", 0x4000, 0x0, 0x8000, 0x20000) +AVR_MCU ("avr128da64", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_AVR128DA64__", 0x4000, 0x0, 0x8000, 0x20000) +AVR_MCU ("avr128db28", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_AVR128DB28__", 0x4000, 0x0, 0x8000, 0x20000) +AVR_MCU ("avr128db32", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_AVR128DB32__", 0x4000, 0x0, 0x8000, 0x20000) +AVR_MCU ("avr128db48", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_AVR128DB48__", 0x4000, 0x0, 0x8000, 0x20000) +AVR_MCU ("avr128db64", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_AVR128DB64__", 0x4000, 0x0, 0x8000, 0x20000) /* Xmega, 64K < Flash <= 128K, RAM > 64K */ AVR_MCU ("avrxmega5", ARCH_AVRXMEGA5, AVR_ISA_NONE, NULL, 0x2000, 0x0, 0x11000, 0) AVR_MCU ("atxmega64a1", ARCH_AVRXMEGA5, AVR_ISA_NONE, "__AVR_ATxmega64A1__", 0x2000, 0x0, 0x11000, 0) diff --git a/gcc/config/avr/gen-avr-mmcu-specs.cc b/gcc/config/avr/gen-avr-mmcu-specs.cc index bf9aa2c..1b75e05 100644 --- a/gcc/config/avr/gen-avr-mmcu-specs.cc +++ b/gcc/config/avr/gen-avr-mmcu-specs.cc @@ -279,7 +279,7 @@ print_mcu (const avr_mcu_t *mcu) if (is_device) { fprintf (f, "*self_spec:\n"); - fprintf (f, "\t%%{!mmcu=avr*: %%<mmcu=* -mmcu=%s} ", arch->name); + fprintf (f, "\t%%<mmcu=* -mmcu=%s ", arch->name); fprintf (f, "%s ", rcall_spec); fprintf (f, "%s\n\n", sp8_spec); diff --git a/gcc/config/avr/gen-avr-mmcu-texi.cc b/gcc/config/avr/gen-avr-mmcu-texi.cc index d9c3a30..0e013e9 100644 --- a/gcc/config/avr/gen-avr-mmcu-texi.cc +++ b/gcc/config/avr/gen-avr-mmcu-texi.cc @@ -23,10 +23,12 @@ #define IN_GEN_AVR_MMCU_TEXI +#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0])) + #include "avr-devices.cc" static const avr_mcu_t* -mcus[sizeof avr_mcu_types / sizeof avr_mcu_types[0]]; +mcus[ARRAY_SIZE (avr_mcu_types)]; static int letter (char c) { @@ -53,10 +55,10 @@ c_prefix (const char *str) { static const char *const prefixes[] = { - "attiny", "atmega", "atxmega", "ata", "at90" + "attiny", "atmega", "atxmega", "ata", "at90", "avr" }; - int i, n = (int) (sizeof (prefixes) / sizeof (*prefixes)); + int i, n = (int) (ARRAY_SIZE (prefixes)); for (i = 0; i < n; i++) if (str_prefix_p (str, prefixes[i])) @@ -185,7 +187,7 @@ int main (void) print_mcus (n_mcus); n_mcus = 0; - for (i = 0; i < sizeof (avr_texinfo) / sizeof (*avr_texinfo); i++) + for (i = 0; i < ARRAY_SIZE (avr_texinfo); i++) if (arch_id == avr_texinfo[i].arch_id) printf ("@item %s\n%s\n", mcu->name, avr_texinfo[i].texinfo); } diff --git a/gcc/config/bfin/bfin.md b/gcc/config/bfin/bfin.md index 0e44653..56b2472 100644 --- a/gcc/config/bfin/bfin.md +++ b/gcc/config/bfin/bfin.md @@ -1741,7 +1741,7 @@ (ior:SI (ashift:SI (match_operand:SI 1 "register_operand" "d") (const_int 1)) (zero_extend:SI (reg:BI REG_CC)))) (set (reg:BI REG_CC) - (zero_extract:BI (match_dup 1) (const_int 31) (const_int 0)))] + (zero_extract:BI (match_dup 1) (const_int 1) (const_int 31)))] "" "%0 = ROT %1 BY 1%!" 
[(set_attr "type" "dsp32shiftimm")]) diff --git a/gcc/config/c6x/c6x.cc b/gcc/config/c6x/c6x.cc index 7fe18d6..dc01a6e 100644 --- a/gcc/config/c6x/c6x.cc +++ b/gcc/config/c6x/c6x.cc @@ -2580,7 +2580,7 @@ static unsigned reg_save_order[] = REG_B14, REG_A15 }; -#define N_SAVE_ORDER (sizeof reg_save_order / sizeof *reg_save_order) +#define N_SAVE_ORDER (ARRAY_SIZE (reg_save_order)) /* Compute the layout of the stack frame and store it in FRAME. */ diff --git a/gcc/config/darwin-c.cc b/gcc/config/darwin-c.cc index 3770857..1f82b36 100644 --- a/gcc/config/darwin-c.cc +++ b/gcc/config/darwin-c.cc @@ -505,7 +505,7 @@ darwin_register_frameworks (const char *sysroot, size_t i; /* Setup default search path for frameworks. */ - for (i=0; i<sizeof (framework_defaults)/sizeof(const char *); ++i) + for (i = 0; i < ARRAY_SIZE (framework_defaults); ++i) { char *str; if (sysroot) @@ -691,7 +691,8 @@ macosx_version_as_macro (void) if (!version_array) goto fail; - if (version_array[MAJOR] < 10 || version_array[MAJOR] > 12) + /* System tools accept up to 99 as a major version. */ + if (version_array[MAJOR] < 10 || version_array[MAJOR] > 99) goto fail; if (version_array[MAJOR] == 10 && version_array[MINOR] < 10) diff --git a/gcc/config/darwin-driver.cc b/gcc/config/darwin-driver.cc index 30e0e64..00287f3 100644 --- a/gcc/config/darwin-driver.cc +++ b/gcc/config/darwin-driver.cc @@ -160,19 +160,13 @@ darwin_find_version_from_kernel (void) goto parse_failed; /* Darwin20 sees a transition to macOS 11. In this, it seems that the - mapping to macOS minor version is now shifted to the kernel minor - version - 1 (at least for the initial releases). */ + mapping to macOS minor version and patch level is now always 0, 0 + (at least for macOS 11 and 12). */ if (major_vers >= 20) { - int minor_vers = *version_p++ - '0'; - if (ISDIGIT (*version_p)) - minor_vers = minor_vers * 10 + (*version_p++ - '0'); - if (*version_p++ != '.') - goto parse_failed; - if (minor_vers > 0) - minor_vers -= 1; /* Kernel 20.3 => macOS 11.2. */ - /* It's not yet clear whether patch level will be considered. */ - asprintf (&new_flag, "%d.%02d.00", major_vers - 9, minor_vers); + /* Apple clang doesn't include the minor version or the patch level + in the object file, nor does it pass it to ld */ + asprintf (&new_flag, "%d.00.00", major_vers - 9); } else if (major_vers - 4 <= 4) /* On 10.4 and earlier, the old linker is used which does not diff --git a/gcc/config/darwin.cc b/gcc/config/darwin.cc index f065a13..1b3de33 100644 --- a/gcc/config/darwin.cc +++ b/gcc/config/darwin.cc @@ -1936,6 +1936,8 @@ darwin_label_is_anonymous_local_objc_name (const char *name) } else if (startswith ((const char *)p, "ClassMethods")) return false; + else if (startswith ((const char *)p, "ClassProtocols")) + return false; else if (startswith ((const char *)p, "Instance")) { if (p[8] == 'I' || p[8] == 'M') diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 3682bd2..f82ec62 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -143,10 +143,7 @@ extern GTY(()) int darwin_ms_struct; Right now there's no mechanism to split up the "variable portion" (%*) of the matched spec string, so where we have some driver specs that take 2 or 3 arguments, these cannot be processed here, but are deferred until the - LINK_SPEC, where they are copied verbatim. - We have a "safe" version of the MacOS version string, that's been sanity- - checked and truncated to minor version. 
If the 'tiny' (3rd) portion of the - value is not significant, it's better to use this in version-compare(). */ + LINK_SPEC, where they are copied verbatim. */ #undef SUBTARGET_DRIVER_SELF_SPECS #define SUBTARGET_DRIVER_SELF_SPECS \ @@ -220,13 +217,8 @@ extern GTY(()) int darwin_ms_struct; "%{image_base*:-Xlinker -image_base -Xlinker %*} %<image_base*", \ "%{init*:-Xlinker -init -Xlinker %*} %<init*", \ "%{multi_module:-Xlinker -multi_module} %<multi_module", \ - "%{multiply_defined*:-Xlinker -multiply_defined -Xlinker %*; \ - :%{shared-libgcc: \ - %:version-compare(< 10.5 asm_macosx_version_min= -Xlinker) \ - %:version-compare(< 10.5 asm_macosx_version_min= -multiply_defined) \ - %:version-compare(< 10.5 asm_macosx_version_min= -Xlinker) \ - %:version-compare(< 10.5 asm_macosx_version_min= suppress)}} \ - %<multiply_defined*", \ + "%{multiply_defined*:-Xlinker -multiply_defined -Xlinker %*} \ + %<multiply_defined* ", \ "%{multiplydefinedunused*:\ -Xlinker -multiply_defined_unused -Xlinker %*} \ %<multiplydefinedunused* ", \ @@ -458,6 +450,9 @@ extern GTY(()) int darwin_ms_struct; %{!force_cpusubtype_ALL:-arch %(darwin_subarch)} "\ LINK_SYSROOT_SPEC \ "%{mmacosx-version-min=*:-macosx_version_min %*} \ + %{!multiply_defined*:%{shared-libgcc: \ + %:version-compare(< 10.5 mmacosx-version-min= -multiply_defined) \ + %:version-compare(< 10.5 mmacosx-version-min= suppress) }} \ %{sectalign*} %{sectcreate*} %{sectobjectsymbols*} %{sectorder*} \ %{segaddr*} %{segcreate*} %{segprot*} " @@ -465,48 +460,36 @@ extern GTY(()) int darwin_ms_struct; #define LIB_SPEC "%{!static:-lSystem}" -/* - Note that by default, -lgcc_eh is not linked against. - This is because,in general, we need to unwind through system libraries that - are linked with the shared unwinder in libunwind (or libgcc_s for 10.4/5). +/* Note that by default, -lgcc_eh (which provides a statically-linked unwinder) + is not used. This is because, in general, we need to unwind through system + libraries that are linked with the shared unwinder in libunwind (or libgcc_s + for OSX 10.4/5 [darwin8/9]). - For -static-libgcc: < 10.6, use the unwinder in libgcc_eh (and find - the emultls impl. there too). + When -static-libgcc is forced: < 10.6, use the unwinder in libgcc_eh (and + find the emultls impl. there too). For -static-libgcc: >= 10.6, the unwinder *still* comes from libSystem and we find the emutls impl from lemutls_w. In either case, the builtins etc. - are linked from -lgcc. + are linked from -lgcc. The eh library is still available so that it could + be specified explicitly if there is some reason to do so. When we have specified shared-libgcc or any case that might require exceptions, we pull the libgcc content (including emulated tls) from - -lgcc_s.1 in GCC and the unwinder from /usr/lib/libgcc_s.1 for < 10.6 and + -lgcc_s.1.1 in GCC and the unwinder from /usr/lib/libgcc_s.1 for < 10.6 and libSystem for >= 10.6 respectively. Otherwise, we just link the emutls/builtins from convenience libs. - If we need exceptions, prior to 10.3.9, then we have to link the static - eh lib, since there's no shared version on the system. - - In all cases, libgcc_s.1 will be installed with the compiler, or any app - built using it, so we can link the builtins and emutls shared on all. - We have to work around that DYLD_XXXX are disabled in macOS 10.11+ which means that any bootstrap trying to use a shared libgcc with a bumped SO- name will fail. This means that we do not accept shared libgcc for these - versions. 
+ versions (the primary reason for forcing a shared libgcc was that it + contained the unwinder on Darwin8 and 9). - For -static-libgcc: >= 10.6, the unwinder *still* comes from libSystem and - we find the emutls impl from lemutls_w. In either case, the builtins etc. - are linked from -lgcc. -> - Otherwise, we just link the shared version of gcc_s.1.1 and pick up - exceptions: + When using the shared version of gcc_s.1.1 the unwinder is provided by: * Prior to 10.3.9, then we have to link the static eh lib, since there - is no shared version on the system. + is no shared unwinder version on the system. * from 10.3.9 to 10.5, from /usr/lib/libgcc_s.1.dylib * from 10.6 onwards, from libSystem.dylib - - In all cases, libgcc_s.1.1 will be installed with the compiler, or any app - built using it, so we can link the builtins and emutls shared on all. */ #undef REAL_LIBGCC_SPEC #define REAL_LIBGCC_SPEC \ diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h index 28ebcad..d89ee7d 100644 --- a/gcc/config/freebsd.h +++ b/gcc/config/freebsd.h @@ -55,7 +55,7 @@ along with GCC; see the file COPYING3. If not see #endif #undef TARGET_LIBC_HAS_FUNCTION -#define TARGET_LIBC_HAS_FUNCTION no_c99_libc_has_function +#define TARGET_LIBC_HAS_FUNCTION bsd_libc_has_function /* Use --as-needed -lgcc_s for eh support. */ #ifdef HAVE_LD_AS_NEEDED diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h index 9b5fee9..b3079ce 100644 --- a/gcc/config/gcn/gcn-hsa.h +++ b/gcc/config/gcn/gcn-hsa.h @@ -75,68 +75,19 @@ extern unsigned int gcn_local_sym_hash (const char *name); supported for gcn. */ #define GOMP_SELF_SPECS "" -#ifdef HAVE_GCN_XNACK_FIJI -#define X_FIJI -#else -#define X_FIJI "!march=*:;march=fiji:;" -#endif -#ifdef HAVE_GCN_XNACK_GFX900 -#define X_900 -#else -#define X_900 "march=gfx900:;" -#endif -#ifdef HAVE_GCN_XNACK_GFX906 -#define X_906 -#else -#define X_906 "march=gfx906:;" -#endif -#ifdef HAVE_GCN_XNACK_GFX908 -#define X_908 -#else -#define X_908 "march=gfx908:;" -#endif - -/* These targets can't have SRAM-ECC, even if a broken assembler allows it. */ -#define S_FIJI "!march=*:;march=fiji:;" -#define S_900 "march=gfx900:;" -#define S_906 "march=gfx906:;" -#ifdef HAVE_GCN_SRAM_ECC_GFX908 -#define S_908 -#else -#define S_908 "march=gfx908:;" -#endif +#define NO_XNACK "!march=*:;march=fiji:;" +#define NO_SRAM_ECC "!march=*:;march=fiji:;march=gfx900:;march=gfx906:;" -#ifdef HAVE_GCN_ASM_V3_SYNTAX -#define SRAMOPT "!msram-ecc=off:-mattr=+sram-ecc;:-mattr=-sram-ecc" -#endif -#ifdef HAVE_GCN_ASM_V4_SYNTAX /* In HSACOv4 no attribute setting means the binary supports "any" hardware configuration. The name of the attribute also changed. */ #define SRAMOPT "msram-ecc=on:-mattr=+sramecc;msram-ecc=off:-mattr=-sramecc" -#endif -#if !defined(SRAMOPT) && !defined(IN_LIBGCC2) -#error "No assembler syntax configured" -#endif - -#ifdef HAVE_GCN_ASM_V4_SYNTAX -/* FIJI cards don't seem to support drivers new enough to allow HSACOv4. */ -#define HSACO3_SELECT_OPT \ - "%{!march=*|march=fiji:--amdhsa-code-object-version=3} " -#else -#define HSACO3_SELECT_OPT -#endif - -/* These targets can't have SRAM-ECC, even if a broken assembler allows it. */ -#define DRIVER_SELF_SPECS \ - "%{march=fiji|march=gfx900|march=gfx906:%{!msram-ecc=*:-msram-ecc=off}}" /* Use LLVM assembler and linker options. 
*/ #define ASM_SPEC "-triple=amdgcn--amdhsa " \ "%:last_arg(%{march=*:-mcpu=%*}) " \ - HSACO3_SELECT_OPT \ - "%{" X_FIJI X_900 X_906 X_908 \ - "mxnack:-mattr=+xnack;:-mattr=-xnack} " \ - "%{" S_FIJI S_900 S_906 S_908 SRAMOPT "} " \ + "%{!march=*|march=fiji:--amdhsa-code-object-version=3} " \ + "%{" NO_XNACK "mxnack:-mattr=+xnack;:-mattr=-xnack} " \ + "%{" NO_SRAM_ECC SRAMOPT "} " \ "-filetype=obj" #define LINK_SPEC "--pie --export-dynamic" #define LIB_SPEC "-lc" diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index c080524..b62dfb4 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -23,16 +23,30 @@ enum processor_type PROCESSOR_FIJI, // gfx803 PROCESSOR_VEGA10, // gfx900 PROCESSOR_VEGA20, // gfx906 - PROCESSOR_GFX908 // as yet unnamed + PROCESSOR_GFX908, + PROCESSOR_GFX90a }; /* Set in gcn_option_override. */ -extern int gcn_isa; - -#define TARGET_GCN3 (gcn_isa == 3) -#define TARGET_GCN3_PLUS (gcn_isa >= 3) -#define TARGET_GCN5 (gcn_isa == 5) -#define TARGET_GCN5_PLUS (gcn_isa >= 5) +extern enum gcn_isa { + ISA_UNKNOWN, + ISA_GCN3, + ISA_GCN5, + ISA_CDNA1, + ISA_CDNA2 +} gcn_isa; + +#define TARGET_GCN3 (gcn_isa == ISA_GCN3) +#define TARGET_GCN3_PLUS (gcn_isa >= ISA_GCN3) +#define TARGET_GCN5 (gcn_isa == ISA_GCN5) +#define TARGET_GCN5_PLUS (gcn_isa >= ISA_GCN5) +#define TARGET_CDNA1 (gcn_isa == ISA_CDNA1) +#define TARGET_CDNA1_PLUS (gcn_isa >= ISA_CDNA1) +#define TARGET_CDNA2 (gcn_isa == ISA_CDNA2) +#define TARGET_CDNA2_PLUS (gcn_isa >= ISA_CDNA2) + +#define TARGET_M0_LDS_LIMIT (TARGET_GCN3) +#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS) enum sram_ecc_type { diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 9f868a1..abe4620 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -823,17 +823,8 @@ static char buf[200]; if (AS_GLOBAL_P (as)) - { - /* Work around assembler bug in which a 64-bit register is expected, - but a 32-bit value would be correct. */ - int reg = REGNO (operands[2]) - FIRST_VGPR_REG; - if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) - sprintf (buf, "global_load%%o0\t%%0, v%d, %%1 offset:%%3%s\;" - "s_waitcnt\tvmcnt(0)", reg, glc); - else - sprintf (buf, "global_load%%o0\t%%0, v[%d:%d], %%1 offset:%%3%s\;" - "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc); - } + sprintf (buf, "global_load%%o0\t%%0, %%2, %%1 offset:%%3%s\;" + "s_waitcnt\tvmcnt(0)", glc); else gcc_unreachable (); @@ -958,17 +949,7 @@ static char buf[200]; if (AS_GLOBAL_P (as)) - { - /* Work around assembler bug in which a 64-bit register is expected, - but a 32-bit value would be correct. */ - int reg = REGNO (operands[1]) - FIRST_VGPR_REG; - if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) - sprintf (buf, "global_store%%s3\tv%d, %%3, %%0 offset:%%2%s", - reg, glc); - else - sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s", - reg, reg + 1, glc); - } + sprintf (buf, "global_store%%s3\t%%1, %%3, %%0 offset:%%2%s", glc); else gcc_unreachable (); diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 402f025..6fc20d3 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -66,7 +66,7 @@ static bool ext_gcn_constants_init = 0; /* Holds the ISA variant, derived from the command line parameters. */ -int gcn_isa = 3; /* Default to GCN3. */ +enum gcn_isa gcn_isa = ISA_GCN3; /* Default to GCN3. */ /* Reserve this much space for LDS (for propagating variables from worker-single mode to worker-partitioned mode), per workgroup. 
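(Aside, not part of the patch: ordering the gcn_isa enumerators above lets every "this ISA or newer" test become a plain comparison, which is why gcn.md can switch from TARGET_GCN5 to TARGET_GCN5_PLUS further down. A trivial illustration with invented names:)

  /* ISA levels ordered oldest to newest; "X or newer" is then just >=.  */
  enum gcn_isa_model { MODEL_GCN3 = 1, MODEL_GCN5, MODEL_CDNA1, MODEL_CDNA2 };

  static int
  gcn5_plus_p (enum gcn_isa_model isa)
  {
    return isa >= MODEL_GCN5;   /* true for GCN5, CDNA1 and CDNA2 */
  }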
Global @@ -129,7 +129,13 @@ gcn_option_override (void) if (!flag_pic) flag_pic = flag_pie; - gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5; + gcn_isa = (gcn_arch == PROCESSOR_FIJI ? ISA_GCN3 + : gcn_arch == PROCESSOR_VEGA10 ? ISA_GCN5 + : gcn_arch == PROCESSOR_VEGA20 ? ISA_GCN5 + : gcn_arch == PROCESSOR_GFX908 ? ISA_CDNA1 + : gcn_arch == PROCESSOR_GFX90a ? ISA_CDNA2 + : ISA_UNKNOWN); + gcc_assert (gcn_isa != ISA_UNKNOWN); /* The default stack size needs to be small for offload kernels because there may be many, many threads. Also, a smaller stack gives a @@ -2632,7 +2638,7 @@ gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, case omp_device_kind: return strcmp (name, "gpu") == 0; case omp_device_arch: - return strcmp (name, "gcn") == 0; + return strcmp (name, "amdgcn") == 0 || strcmp (name, "gcn") == 0; case omp_device_isa: if (strcmp (name, "fiji") == 0) return gcn_arch == PROCESSOR_FIJI; @@ -2642,6 +2648,8 @@ gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, return gcn_arch == PROCESSOR_VEGA20; if (strcmp (name, "gfx908") == 0) return gcn_arch == PROCESSOR_GFX908; + if (strcmp (name, "gfx90a") == 0) + return gcn_arch == PROCESSOR_GFX90a; return 0; default: gcc_unreachable (); @@ -3081,13 +3089,35 @@ gcn_expand_prologue () /* Ensure that the scheduler doesn't do anything unexpected. */ emit_insn (gen_blockage ()); - /* m0 is initialized for the usual LDS DS and FLAT memory case. - The low-part is the address of the topmost addressable byte, which is - size-1. The high-part is an offset and should be zero. */ - emit_move_insn (gen_rtx_REG (SImode, M0_REG), - gen_int_mode (LDS_SIZE, SImode)); + if (TARGET_M0_LDS_LIMIT) + { + /* m0 is initialized for the usual LDS DS and FLAT memory case. + The low-part is the address of the topmost addressable byte, which is + size-1. The high-part is an offset and should be zero. */ + emit_move_insn (gen_rtx_REG (SImode, M0_REG), + gen_int_mode (LDS_SIZE, SImode)); + + emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); + } - emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); + if (TARGET_PACKED_WORK_ITEMS + && cfun && cfun->machine && !cfun->machine->normal_function) + { + /* v0 conatins the X, Y and Z dimensions all in one. + Expand them out for ABI compatibility. */ + /* TODO: implement and use zero_extract. */ + rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); + emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), + gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10))); + emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10))); + emit_insn (gen_prologue_use (v1)); + + rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2)); + emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), + gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20))); + emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20))); + emit_insn (gen_prologue_use (v2)); + } if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp) { @@ -4131,10 +4161,13 @@ gcn_make_vec_perm_address (unsigned int *perm) permutations. 
*/ static bool -gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst, - rtx src0, rtx src1, +gcn_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx dst, rtx src0, rtx src1, const vec_perm_indices & sel) { + if (vmode != op_mode) + return false; + unsigned int nelt = GET_MODE_NUNITS (vmode); gcc_assert (VECTOR_MODE_P (vmode)); @@ -5216,71 +5249,41 @@ gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo, static void output_file_start (void) { + /* In HSACOv4 no attribute setting means the binary supports "any" hardware + configuration. In GCC binaries, this is true for SRAM ECC, but not + XNACK. */ + const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-"); + const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+" + : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-" + : ""); + const char *cpu; - bool use_xnack_attr = true; - bool use_sram_attr = true; switch (gcn_arch) { case PROCESSOR_FIJI: cpu = "gfx803"; -#ifndef HAVE_GCN_XNACK_FIJI - use_xnack_attr = false; -#endif - use_sram_attr = false; + xnack = ""; + sram_ecc = ""; break; case PROCESSOR_VEGA10: cpu = "gfx900"; -#ifndef HAVE_GCN_XNACK_GFX900 - use_xnack_attr = false; -#endif - use_sram_attr = false; + sram_ecc = ""; break; case PROCESSOR_VEGA20: cpu = "gfx906"; -#ifndef HAVE_GCN_XNACK_GFX906 - use_xnack_attr = false; -#endif - use_sram_attr = false; + sram_ecc = ""; break; case PROCESSOR_GFX908: cpu = "gfx908"; -#ifndef HAVE_GCN_XNACK_GFX908 - use_xnack_attr = false; -#endif -#ifndef HAVE_GCN_SRAM_ECC_GFX908 - use_sram_attr = false; -#endif + break; + case PROCESSOR_GFX90a: + cpu = "gfx90a"; break; default: gcc_unreachable (); } -#if HAVE_GCN_ASM_V3_SYNTAX - const char *xnack = (flag_xnack ? "+xnack" : ""); - const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : ""); -#endif -#if HAVE_GCN_ASM_V4_SYNTAX - /* In HSACOv4 no attribute setting means the binary supports "any" hardware - configuration. In GCC binaries, this is true for SRAM ECC, but not - XNACK. */ - const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-"); - const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+" - : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-" - : ""); -#endif - if (!use_xnack_attr) - xnack = ""; - if (!use_sram_attr) - sram_ecc = ""; - fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n", - cpu, -#if HAVE_GCN_ASM_V3_SYNTAX - xnack, sram_ecc -#endif -#ifdef HAVE_GCN_ASM_V4_SYNTAX - sram_ecc, xnack -#endif - ); + cpu, sram_ecc, xnack); } /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h. @@ -5329,6 +5332,10 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) sgpr = MAX_NORMAL_SGPR_COUNT; } + /* The gfx90a accum_offset field can't represent 0 registers. */ + if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4) + vgpr = 4; + fputs ("\t.rodata\n" "\t.p2align\t6\n" "\t.amdhsa_kernel\t", file); @@ -5397,6 +5404,11 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) one 64th the wave-front stack size. 
*/ stack_size_opt / 64, LDS_SIZE); + if (gcn_arch == PROCESSOR_GFX90a) + fprintf (file, + "\t .amdhsa_accum_offset\t%i\n" + "\t .amdhsa_tg_split\t0\n", + (vgpr+3)&~3); // I think this means the AGPRs come after the VGPRs fputs ("\t.end_amdhsa_kernel\n", file); #if 1 @@ -5425,6 +5437,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) LDS_SIZE, stack_size_opt / 64, sgpr, vgpr); + if (gcn_arch == PROCESSOR_GFX90a) + fprintf (file, " .agpr_count: 0\n"); // AGPRs are not used, yet fputs (" .end_amdgpu_metadata\n", file); #endif @@ -5588,8 +5602,9 @@ gcn_print_lds_decl (FILE *f, tree var) fprintf (f, "%u", gang_private_hwm); gang_private_hwm += size; if (gang_private_hwm > gang_private_size_opt) - error ("gang-private data-share memory exhausted (increase with " - "%<-mgang-private-size=<number>%>)"); + error ("%d bytes of gang-private data-share memory exhausted" + " (increase with %<-mgang-private-size=%d%>, for example)", + gang_private_size_opt, gang_private_hwm); } } @@ -5723,23 +5738,10 @@ print_operand_address (FILE *file, rtx mem) if (vgpr_offset == NULL_RTX) /* In this case, the vector offset is zero, so we use the first lane of v1, which is initialized to zero. */ - { - if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) - fprintf (file, "v1"); - else - fprintf (file, "v[1:2]"); - } + fprintf (file, "v1"); else if (REG_P (vgpr_offset) && VGPR_REGNO_P (REGNO (vgpr_offset))) - { - if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) - fprintf (file, "v%d", - REGNO (vgpr_offset) - FIRST_VGPR_REG); - else - fprintf (file, "v[%d:%d]", - REGNO (vgpr_offset) - FIRST_VGPR_REG, - REGNO (vgpr_offset) - FIRST_VGPR_REG + 1); - } + fprintf (file, "v%d", REGNO (vgpr_offset) - FIRST_VGPR_REG); else output_operand_lossage ("bad ADDR_SPACE_GLOBAL address"); } diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index 9ae8919..a129760 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -24,6 +24,10 @@ builtin_define ("__GCN3__"); \ else if (TARGET_GCN5) \ builtin_define ("__GCN5__"); \ + else if (TARGET_CDNA1) \ + builtin_define ("__CDNA1__"); \ + else if (TARGET_CDNA2) \ + builtin_define ("__CDNA2__"); \ } \ while(0) diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 21a7476..53e846e 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -1410,7 +1410,7 @@ "" { if (can_create_pseudo_p () - && !TARGET_GCN5 + && !TARGET_GCN5_PLUS && !gcn_inline_immediate_operand (operands[2], SImode)) operands[2] = force_reg (SImode, operands[2]); @@ -1451,7 +1451,7 @@ (match_operand:SI 1 "register_operand" "Sg,Sg,v")) (match_operand:DI 2 "gcn_32bit_immediate_operand" "A, B,A")) (const_int 32))))] - "TARGET_GCN5 || gcn_inline_immediate_operand (operands[2], SImode)" + "TARGET_GCN5_PLUS || gcn_inline_immediate_operand (operands[2], SImode)" "@ s_mul_hi<sgnsuffix>0\t%0, %1, %2 s_mul_hi<sgnsuffix>0\t%0, %1, %2 @@ -1469,7 +1469,7 @@ "" { if (can_create_pseudo_p () - && !TARGET_GCN5 + && !TARGET_GCN5_PLUS && !gcn_inline_immediate_operand (operands[2], SImode)) operands[2] = force_reg (SImode, operands[2]); @@ -1506,7 +1506,7 @@ (match_operand:SI 1 "register_operand" "Sg, Sg, v")) (match_operand:DI 2 "gcn_32bit_immediate_operand" "A, B, A")))] - "TARGET_GCN5 || gcn_inline_immediate_operand (operands[2], SImode)" + "TARGET_GCN5_PLUS || gcn_inline_immediate_operand (operands[2], SImode)" "#" "&& reload_completed" [(const_int 0)] diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index 54da11f..9606aaf 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -37,6 +37,9 @@ 
Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA20) EnumValue Enum(gpu_type) String(gfx908) Value(PROCESSOR_GFX908) +EnumValue +Enum(gpu_type) String(gfx90a) Value(PROCESSOR_GFX90a) + march= Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_FIJI) Specify the name of the target GPU. diff --git a/gcc/config/gcn/mkoffload.cc b/gcc/config/gcn/mkoffload.cc index 94ba7ff..ed93ae8 100644 --- a/gcc/config/gcn/mkoffload.cc +++ b/gcc/config/gcn/mkoffload.cc @@ -55,9 +55,8 @@ #define EF_AMDGPU_MACH_AMDGCN_GFX906 0x2f #undef EF_AMDGPU_MACH_AMDGCN_GFX908 #define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30 - -#define EF_AMDGPU_XNACK_V3 0x100 -#define EF_AMDGPU_SRAM_ECC_V3 0x200 +#undef EF_AMDGPU_MACH_AMDGCN_GFX90a +#define EF_AMDGPU_MACH_AMDGCN_GFX90a 0x3f #define EF_AMDGPU_FEATURE_XNACK_V4 0x300 /* Mask. */ #define EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 0x000 @@ -71,19 +70,6 @@ #define EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 0x800 #define EF_AMDGPU_FEATURE_SRAMECC_ON_V4 0xc00 -#ifdef HAVE_GCN_ASM_V3_SYNTAX -#define SET_XNACK_ON(VAR) VAR |= EF_AMDGPU_XNACK_V3 -#define SET_XNACK_OFF(VAR) VAR &= ~EF_AMDGPU_XNACK_V3 -#define TEST_XNACK(VAR) (VAR & EF_AMDGPU_XNACK_V3) - -#define SET_SRAM_ECC_ON(VAR) VAR |= EF_AMDGPU_SRAM_ECC_V3 -#define SET_SRAM_ECC_ANY(VAR) SET_SRAM_ECC_ON (VAR) -#define SET_SRAM_ECC_OFF(VAR) VAR &= ~EF_AMDGPU_SRAM_ECC_V3 -#define SET_SRAM_ECC_UNSUPPORTED(VAR) SET_SRAM_ECC_OFF (VAR) -#define TEST_SRAM_ECC_ANY(VAR) 0 /* Not supported. */ -#define TEST_SRAM_ECC_ON(VAR) (VAR & EF_AMDGPU_SRAM_ECC_V3) -#endif -#ifdef HAVE_GCN_ASM_V4_SYNTAX #define SET_XNACK_ON(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \ | EF_AMDGPU_FEATURE_XNACK_ON_V4) #define SET_XNACK_OFF(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \ @@ -104,7 +90,6 @@ == EF_AMDGPU_FEATURE_SRAMECC_ANY_V4) #define TEST_SRAM_ECC_ON(VAR) ((VAR & EF_AMDGPU_FEATURE_SRAMECC_V4) \ == EF_AMDGPU_FEATURE_SRAMECC_ON_V4) -#endif #ifndef R_AMDGPU_NONE #define R_AMDGPU_NONE 0 @@ -130,12 +115,7 @@ static struct obstack files_to_cleanup; enum offload_abi offload_abi = OFFLOAD_ABI_UNSET; uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX803; // Default GPU architecture. uint32_t elf_flags = -#ifdef HAVE_GCN_ASM_V3_SYNTAX - 0; -#endif -#ifdef HAVE_GCN_ASM_V4_SYNTAX (EF_AMDGPU_FEATURE_XNACK_ANY_V4 | EF_AMDGPU_FEATURE_SRAMECC_ANY_V4); -#endif /* Delete tempfiles. */ @@ -362,14 +342,9 @@ copy_early_debug_info (const char *infile, const char *outfile) /* Patch the correct elf architecture flag into the file. */ ehdr.e_ident[7] = ELFOSABI_AMDGPU_HSA; -#ifdef HAVE_GCN_ASM_V3_SYNTAX - ehdr.e_ident[8] = ELFABIVERSION_AMDGPU_HSA_V3; -#endif -#ifdef HAVE_GCN_ASM_V4_SYNTAX ehdr.e_ident[8] = (elf_arch == EF_AMDGPU_MACH_AMDGCN_GFX803 ? 
ELFABIVERSION_AMDGPU_HSA_V3 : ELFABIVERSION_AMDGPU_HSA_V4); -#endif ehdr.e_type = ET_REL; ehdr.e_machine = EM_AMDGPU; ehdr.e_flags = elf_arch | elf_flags_actual; @@ -884,7 +859,6 @@ main (int argc, char **argv) bool fopenacc = false; bool fPIC = false; bool fpic = false; - bool sram_seen = false; for (int i = 1; i < argc; i++) { #define STR "-foffload-abi=" @@ -912,20 +886,11 @@ main (int argc, char **argv) else if (strcmp (argv[i], "-mno-xnack") == 0) SET_XNACK_OFF (elf_flags); else if (strcmp (argv[i], "-msram-ecc=on") == 0) - { - SET_SRAM_ECC_ON (elf_flags); - sram_seen = true; - } + SET_SRAM_ECC_ON (elf_flags); else if (strcmp (argv[i], "-msram-ecc=any") == 0) - { - SET_SRAM_ECC_ANY (elf_flags); - sram_seen = true; - } + SET_SRAM_ECC_ANY (elf_flags); else if (strcmp (argv[i], "-msram-ecc=off") == 0) - { - SET_SRAM_ECC_OFF (elf_flags); - sram_seen = true; - } + SET_SRAM_ECC_OFF (elf_flags); else if (strcmp (argv[i], "-save-temps") == 0) save_temps = true; else if (strcmp (argv[i], "-v") == 0) @@ -941,33 +906,13 @@ main (int argc, char **argv) elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX906; else if (strcmp (argv[i], "-march=gfx908") == 0) elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX908; + else if (strcmp (argv[i], "-march=gfx90a") == 0) + elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX90a; } if (!(fopenacc ^ fopenmp)) fatal_error (input_location, "either -fopenacc or -fopenmp must be set"); - if (!sram_seen) - { -#ifdef HAVE_GCN_ASM_V3_SYNTAX - /* For HSACOv3, the SRAM-ECC feature defaults to "on" on GPUs where the - feature is available. - (HSACOv4 has elf_flags initialsed to "any" in all cases.) */ - switch (elf_arch) - { - case EF_AMDGPU_MACH_AMDGCN_GFX803: - case EF_AMDGPU_MACH_AMDGCN_GFX900: - case EF_AMDGPU_MACH_AMDGCN_GFX906: -#ifndef HAVE_GCN_SRAM_ECC_GFX908 - case EF_AMDGPU_MACH_AMDGCN_GFX908: -#endif - break; - default: - SET_SRAM_ECC_ON (elf_flags); - break; - } -#endif - } - const char *abi; switch (offload_abi) { diff --git a/gcc/config/gcn/t-gcn-hsa b/gcc/config/gcn/t-gcn-hsa index 10e31f3..9e03ec8 100644 --- a/gcc/config/gcn/t-gcn-hsa +++ b/gcc/config/gcn/t-gcn-hsa @@ -42,8 +42,8 @@ ALL_HOST_OBJS += gcn-run.o gcn-run$(exeext): gcn-run.o +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ $< -ldl -MULTILIB_OPTIONS = march=gfx900/march=gfx906/march=gfx908 -MULTILIB_DIRNAMES = gfx900 gfx906 gfx908 +MULTILIB_OPTIONS = march=gfx900/march=gfx906/march=gfx908/march=gfx90a +MULTILIB_DIRNAMES = gfx900 gfx906 gfx908 gfx90a gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.cc $(COMPILE) $< diff --git a/gcc/config/gcn/t-omp-device b/gcc/config/gcn/t-omp-device index cd56e2f..27d36db 100644 --- a/gcc/config/gcn/t-omp-device +++ b/gcc/config/gcn/t-omp-device @@ -1,4 +1,4 @@ omp-device-properties-gcn: $(srcdir)/config/gcn/gcn.cc echo kind: gpu > $@ - echo arch: gcn >> $@ - echo isa: fiji gfx900 gfx906 gfx908 >> $@ + echo arch: amdgcn gcn >> $@ + echo isa: fiji gfx900 gfx906 gfx908 gfx90a >> $@ diff --git a/gcc/config/host-linux.cc b/gcc/config/host-linux.cc index ff4557d..817d3c0 100644 --- a/gcc/config/host-linux.cc +++ b/gcc/config/host-linux.cc @@ -98,6 +98,8 @@ # define TRY_EMPTY_VM_SPACE 0x60000000 #elif defined(__riscv) && defined (__LP64__) # define TRY_EMPTY_VM_SPACE 0x1000000000 +#elif defined(__loongarch__) && defined(__LP64__) +# define TRY_EMPTY_VM_SPACE 0x8000000000 #else # define TRY_EMPTY_VM_SPACE 0 #endif diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h index 29511fd..77d6249 100644 --- a/gcc/config/i386/avx512fintrin.h +++ b/gcc/config/i386/avx512fintrin.h @@ -3286,31 +3286,67 @@ 
_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) (__mmask8) __U, __R); } #else -#define _mm512_scalef_round_pd(A, B, C) \ - (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) - -#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \ - (__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C) - -#define _mm512_maskz_scalef_round_pd(U, A, B, C) \ - (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) +#define _mm512_scalef_round_pd(A, B, C) \ + ((__m512d) \ + __builtin_ia32_scalefpd512_mask((A), (B), \ + (__v8df) _mm512_undefined_pd(), \ + -1, (C))) + +#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \ + ((__m512d) __builtin_ia32_scalefpd512_mask((A), (B), (W), (U), (C))) + +#define _mm512_maskz_scalef_round_pd(U, A, B, C) \ + ((__m512d) \ + __builtin_ia32_scalefpd512_mask((A), (B), \ + (__v8df) _mm512_setzero_pd(), \ + (U), (C))) + +#define _mm512_scalef_round_ps(A, B, C) \ + ((__m512) \ + __builtin_ia32_scalefps512_mask((A), (B), \ + (__v16sf) _mm512_undefined_ps(), \ + -1, (C))) + +#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \ + ((__m512) __builtin_ia32_scalefps512_mask((A), (B), (W), (U), (C))) + +#define _mm512_maskz_scalef_round_ps(U, A, B, C) \ + ((__m512) \ + __builtin_ia32_scalefps512_mask((A), (B), \ + (__v16sf) _mm512_setzero_ps(), \ + (U), (C))) + +#define _mm_scalef_round_sd(A, B, C) \ + ((__m128d) \ + __builtin_ia32_scalefsd_mask_round ((A), (B), \ + (__v2df) _mm_undefined_pd (), \ + -1, (C))) -#define _mm512_scalef_round_ps(A, B, C) \ - (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) +#define _mm_scalef_round_ss(A, B, C) \ + ((__m128) \ + __builtin_ia32_scalefss_mask_round ((A), (B), \ + (__v4sf) _mm_undefined_ps (), \ + -1, (C))) -#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \ - (__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C) +#define _mm_mask_scalef_round_sd(W, U, A, B, C) \ + ((__m128d) \ + __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C))) -#define _mm512_maskz_scalef_round_ps(U, A, B, C) \ - (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) +#define _mm_mask_scalef_round_ss(W, U, A, B, C) \ + ((__m128) \ + __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C))) -#define _mm_scalef_round_sd(A, B, C) \ - (__m128d)__builtin_ia32_scalefsd_mask_round (A, B, \ - (__v2df)_mm_setzero_pd (), -1, C) +#define _mm_maskz_scalef_round_sd(U, A, B, C) \ + ((__m128d) \ + __builtin_ia32_scalefsd_mask_round ((A), (B), \ + (__v2df) _mm_setzero_pd (), \ + (U), (C))) -#define _mm_scalef_round_ss(A, B, C) \ - (__m128)__builtin_ia32_scalefss_mask_round (A, B, \ - (__v4sf)_mm_setzero_ps (), -1, C) +#define _mm_maskz_scalef_round_ss(U, A, B, C) \ + ((__m128) \ + __builtin_ia32_scalefss_mask_round ((A), (B), \ + (__v4sf) _mm_setzero_ps (), \ + (U), (C))) #endif #define _mm_mask_scalef_sd(W, U, A, B) \ diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 8b3dc2b..a4c2fed 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -204,6 +204,10 @@ #define signature_VORTEX_ecx 0x436f5320 #define signature_VORTEX_edx 0x36387865 +#define signature_SHANGHAI_ebx 0x68532020 +#define signature_SHANGHAI_ecx 0x20206961 +#define signature_SHANGHAI_edx 0x68676e61 + #ifndef __x86_64__ /* At least one cpu (Winchip 2) does not set %ebx and %ecx for cpuid leaf 1. 
Forcibly zero the two registers before diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index 9e0ae0b..3c702fd 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -438,7 +438,8 @@ const char *host_detect_local_cpu (int argc, const char **argv) || vendor == VENDOR_CYRIX || vendor == VENDOR_NSC) cache = detect_caches_amd (ext_level); - else if (vendor == VENDOR_INTEL) + else if (vendor == VENDOR_INTEL + || vendor == VENDOR_ZHAOXIN) { bool xeon_mp = (family == 15 && model == 6); cache = detect_caches_intel (xeon_mp, max_level, @@ -518,6 +519,20 @@ const char *host_detect_local_cpu (int argc, const char **argv) processor = PROCESSOR_I486; } } + else if (vendor == VENDOR_ZHAOXIN) + { + processor = PROCESSOR_GENERIC; + + switch (family) + { + case 7: + if (model == 0x3b) + processor = PROCESSOR_LUJIAZUI; + break; + default: + break; + } + } else { switch (family) @@ -773,6 +788,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) case PROCESSOR_BTVER2: cpu = "btver2"; break; + case PROCESSOR_LUJIAZUI: + cpu = "lujiazui"; + break; default: /* Use something reasonable. */ diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h index 23b54c5..cab9be2 100644 --- a/gcc/config/i386/gnu-user-common.h +++ b/gcc/config/i386/gnu-user-common.h @@ -66,7 +66,8 @@ along with GCC; see the file COPYING3. If not see #define STACK_CHECK_STATIC_BUILTIN 1 /* We only build the -fsplit-stack support in libgcc if the - assembler has full support for the CFI directives. */ -#if HAVE_GAS_CFI_PERSONALITY_DIRECTIVE + assembler has full support for the CFI directives. Also + we only support -fsplit-stack on glibc targets. */ +#if (DEFAULT_LIBC == LIBC_GLIBC) && HAVE_GAS_CFI_PERSONALITY_DIRECTIVE #define TARGET_CAN_SPLIT_STACK #endif diff --git a/gcc/config/i386/gnu.h b/gcc/config/i386/gnu.h index 401e60c..fb8d69a 100644 --- a/gcc/config/i386/gnu.h +++ b/gcc/config/i386/gnu.h @@ -41,8 +41,9 @@ along with GCC. If not, see <http://www.gnu.org/licenses/>. #define TARGET_THREAD_SSP_OFFSET 0x14 /* We only build the -fsplit-stack support in libgcc if the - assembler has full support for the CFI directives. */ -#if HAVE_GAS_CFI_PERSONALITY_DIRECTIVE + assembler has full support for the CFI directives. Also + we only support -fsplit-stack on glibc targets. */ +#if (DEFAULT_LIBC == LIBC_GLIBC) && HAVE_GAS_CFI_PERSONALITY_DIRECTIVE #define TARGET_CAN_SPLIT_STACK #endif /* We steal the last transactional memory word. 
*/ diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index e33f06a..7a2da1d 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -516,6 +516,7 @@ DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI, INT) DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI, INT, V8DI, UQI) DEF_FUNCTION_TYPE (V8DI, V8DI, V4DI, INT, V8DI, UQI) DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V4DI, V32QI, V32QI) DEF_FUNCTION_TYPE (V8DI, V64QI, V64QI) DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI) DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index ea32755..e6daad4 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -906,7 +906,7 @@ BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_palignrdi, /* SSE4.1 */ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) -BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF) +BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) @@ -1217,7 +1217,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V4DI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT) diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc index 2570501..96743e6 100644 --- a/gcc/config/i386/i386-builtins.cc +++ b/gcc/config/i386/i386-builtins.cc @@ -1785,7 +1785,12 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype, bool si; enum ix86_builtins code; - if (! TARGET_AVX2 || !TARGET_USE_GATHER) + if (! 
TARGET_AVX2 + || (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 2u) + ? !TARGET_USE_GATHER_2PARTS + : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u) + ? !TARGET_USE_GATHER_4PARTS + : !TARGET_USE_GATHER))) return NULL_TREE; if ((TREE_CODE (index_type) != INTEGER_TYPE @@ -1931,8 +1936,7 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) enum feature_priority priority = P_NONE; - static unsigned int NUM_FEATURES - = sizeof (isa_names_table) / sizeof (_isa_names_table); + static unsigned int NUM_FEATURES = ARRAY_SIZE (isa_names_table); unsigned int i; @@ -2275,7 +2279,7 @@ fold_builtin_cpu (tree fndecl, tree *args) /* Check the value. */ final = build2 (EQ_EXPR, unsigned_type_node, ref, build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); + return build1 (NOP_EXPR, integer_type_node, final); } else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS) { @@ -2285,8 +2289,7 @@ fold_builtin_cpu (tree fndecl, tree *args) tree final; unsigned int field_val = 0; - unsigned int NUM_ISA_NAMES - = sizeof (isa_names_table) / sizeof (struct _isa_names_table); + unsigned int NUM_ISA_NAMES = ARRAY_SIZE (isa_names_table); for (i = 0; i < NUM_ISA_NAMES; i++) if (strcmp (isa_names_table[i].name, @@ -2300,7 +2303,8 @@ fold_builtin_cpu (tree fndecl, tree *args) return integer_zero_node; } - if (isa_names_table[i].feature >= 32) + unsigned feature = isa_names_table[i].feature; + if (feature >= INT_TYPE_SIZE) { if (ix86_cpu_features2_var == nullptr) { @@ -2318,46 +2322,44 @@ fold_builtin_cpu (tree fndecl, tree *args) varpool_node::add (ix86_cpu_features2_var); } - for (unsigned int j = 0; j < SIZE_OF_CPU_FEATURES; j++) - if (isa_names_table[i].feature < (32 + 32 + j * 32)) - { - field_val = (1U << (isa_names_table[i].feature - - (32 + j * 32))); - tree index = size_int (j); - array_elt = build4 (ARRAY_REF, unsigned_type_node, - ix86_cpu_features2_var, - index, NULL_TREE, NULL_TREE); - /* Return __cpu_features2[index] & field_val */ - final = build2 (BIT_AND_EXPR, unsigned_type_node, - array_elt, - build_int_cstu (unsigned_type_node, - field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } + feature -= INT_TYPE_SIZE; + field_val = 1U << (feature % INT_TYPE_SIZE); + tree index = size_int (feature / INT_TYPE_SIZE); + array_elt = build4 (ARRAY_REF, unsigned_type_node, + ix86_cpu_features2_var, + index, NULL_TREE, NULL_TREE); + /* Return __cpu_features2[index] & field_val */ + final = build2 (BIT_AND_EXPR, unsigned_type_node, + array_elt, + build_int_cstu (unsigned_type_node, + field_val)); + return build1 (NOP_EXPR, integer_type_node, final); } - - field = TYPE_FIELDS (ix86_cpu_model_type_node); - /* Get the last field, which is __cpu_features. */ - while (DECL_CHAIN (field)) - field = DECL_CHAIN (field); - - /* Get the appropriate field: __cpu_model.__cpu_features */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var, - field, NULL_TREE); - - /* Access the 0th element of __cpu_features array. 
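The folded form of __builtin_cpu_supports now derives the __cpu_features2 word and bit directly from the feature number instead of looping over SIZE_OF_CPU_FEATURES words. A sketch of the arithmetic, using a hypothetical feature number purely for illustration:

    /* Features 0..31 live in __cpu_model.__cpu_features[0]; higher ones in
       __cpu_features2[].  For a hypothetical feature number 70:
         feature -= 32             ->  38
         index   = 38 / 32         ->  1        (__cpu_features2[1])
         mask    = 1u << (38 % 32) ->  1u << 6.  */
    int
    have_avx2 (void)
    {
      /* User-level view: the builtin folds to a masked load from one of
         these feature words.  */
      return __builtin_cpu_supports ("avx2");
    }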
*/ - array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, - integer_zero_node, NULL_TREE, NULL_TREE); - - field_val = (1U << isa_names_table[i].feature); - /* Return __cpu_model.__cpu_features[0] & field_val */ - final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, - build_int_cstu (unsigned_type_node, field_val)); - if (isa_names_table[i].feature == (INT_TYPE_SIZE - 1)) - return build2 (NE_EXPR, integer_type_node, final, - build_int_cst (unsigned_type_node, 0)); else - return build1 (CONVERT_EXPR, integer_type_node, final); + { + field = TYPE_FIELDS (ix86_cpu_model_type_node); + /* Get the last field, which is __cpu_features. */ + while (DECL_CHAIN (field)) + field = DECL_CHAIN (field); + + /* Get the appropriate field: __cpu_model.__cpu_features */ + ref = build3 (COMPONENT_REF, TREE_TYPE (field), ix86_cpu_model_var, + field, NULL_TREE); + + /* Access the 0th element of __cpu_features array. */ + array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, + integer_zero_node, NULL_TREE, NULL_TREE); + + field_val = (1U << feature); + /* Return __cpu_model.__cpu_features[0] & field_val */ + final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, + build_int_cstu (unsigned_type_node, field_val)); + if (feature == (INT_TYPE_SIZE - 1)) + return build2 (NE_EXPR, integer_type_node, final, + build_int_cst (unsigned_type_node, 0)); + else + return build1 (NOP_EXPR, integer_type_node, final); + } } gcc_unreachable (); } diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index c73c1b1..eb0e3b3 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -140,6 +140,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__btver2"); def_or_undef (parse_in, "__btver2__"); break; + case PROCESSOR_LUJIAZUI: + def_or_undef (parse_in, "__lujiazui"); + def_or_undef (parse_in, "__lujiazui__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__pentium4"); def_or_undef (parse_in, "__pentium4__"); @@ -332,6 +336,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_BTVER2: def_or_undef (parse_in, "__tune_btver2__"); break; + case PROCESSOR_LUJIAZUI: + def_or_undef (parse_in, "__tune_lujiazui__"); + break; case PROCESSOR_PENTIUM4: def_or_undef (parse_in, "__tune_pentium4__"); break; diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index e85641d..8bc5430 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2267,12 +2267,20 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) /* Handle special case - vector comparsion with boolean result, transform it using ptest instruction. */ - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT + || mode == OImode) { rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode; gcc_assert (code == EQ || code == NE); + + if (mode == OImode) + { + op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode); + op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode); + mode = p_mode; + } /* Generate XOR since we can't check that one operand is zero vector. */ tmp = gen_reg_rtx (mode); emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); @@ -2309,21 +2317,15 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) case E_DImode: if (TARGET_64BIT) goto simple; - /* For 32-bit target DI comparison may be performed on - SSE registers. 
To allow this we should avoid split - to SI mode which is achieved by doing xor in DI mode - and then comparing with zero (which is recognized by - STV pass). We don't compare using xor when optimizing - for size. */ - if (!optimize_insn_for_size_p () - && TARGET_STV - && (code == EQ || code == NE)) - { - op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); - op1 = const0_rtx; - } /* FALLTHRU */ case E_TImode: + /* DI and TI mode equality/inequality comparisons may be performed + on SSE registers. Avoid splitting them, except when optimizing + for size. */ + if ((code == EQ || code == NE) + && !optimize_insn_for_size_p ()) + goto simple; + /* Expand DImode branch into multiple compare+branch. */ { rtx lo[2], hi[2]; @@ -2342,34 +2344,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) submode = mode == DImode ? SImode : DImode; - /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to - avoid two branches. This costs one extra insn, so disable when - optimizing for size. */ - - if ((code == EQ || code == NE) - && (!optimize_insn_for_size_p () - || hi[1] == const0_rtx || lo[1] == const0_rtx)) - { - rtx xor0, xor1; - - xor1 = hi[0]; - if (hi[1] != const0_rtx) - xor1 = expand_binop (submode, xor_optab, xor1, hi[1], - NULL_RTX, 0, OPTAB_WIDEN); - - xor0 = lo[0]; - if (lo[1] != const0_rtx) - xor0 = expand_binop (submode, xor_optab, xor0, lo[1], - NULL_RTX, 0, OPTAB_WIDEN); - - tmp = expand_binop (submode, ior_optab, xor1, xor0, - NULL_RTX, 0, OPTAB_WIDEN); - - ix86_expand_branch (code, tmp, const0_rtx, label); - return; - } - - /* Otherwise, if we are doing less-than or greater-or-equal-than, + /* If we are doing less-than or greater-or-equal-than, op1 is a constant and the low word is zero, then we can just examine the high word. Similarly for low word -1 and less-or-equal-than or greater-than. */ @@ -3134,8 +3109,11 @@ ix86_expand_int_movcc (rtx operands[]) rtx compare_op; machine_mode mode = GET_MODE (operands[0]); bool sign_bit_compare_p = false; + bool negate_cc_compare_p = false; rtx op0 = XEXP (operands[1], 0); rtx op1 = XEXP (operands[1], 1); + rtx op2 = operands[2]; + rtx op3 = operands[3]; if (GET_MODE (op0) == TImode || (GET_MODE (op0) == DImode @@ -3153,29 +3131,73 @@ ix86_expand_int_movcc (rtx operands[]) || (op1 == constm1_rtx && (code == GT || code == LE))) sign_bit_compare_p = true; + /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3, + but if op1 is a constant, the latter form allows more optimizations, + either through the last 2 ops being constant handling, or the one + constant and one variable cases. On the other side, for cmov the + former might be better as we don't need to load the constant into + another register. */ + if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2)) + op2 = op1; + /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */ + else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3)) + op3 = op1; + /* Don't attempt mode expansion here -- if we had to expand 5 or 6 HImode insns, we'd be swallowed in word prefix ops. */ if ((mode != HImode || TARGET_FAST_PREFIX) && (mode != (TARGET_64BIT ? 
TImode : DImode)) - && CONST_INT_P (operands[2]) - && CONST_INT_P (operands[3])) + && CONST_INT_P (op2) + && CONST_INT_P (op3)) { rtx out = operands[0]; - HOST_WIDE_INT ct = INTVAL (operands[2]); - HOST_WIDE_INT cf = INTVAL (operands[3]); + HOST_WIDE_INT ct = INTVAL (op2); + HOST_WIDE_INT cf = INTVAL (op3); HOST_WIDE_INT diff; + if ((mode == SImode + || (TARGET_64BIT && mode == DImode)) + && (GET_MODE (op0) == SImode + || (TARGET_64BIT && GET_MODE (op0) == DImode))) + { + /* Special case x != 0 ? -1 : y. */ + if (code == NE && op1 == const0_rtx && ct == -1) + { + negate_cc_compare_p = true; + std::swap (ct, cf); + code = EQ; + } + else if (code == EQ && op1 == const0_rtx && cf == -1) + negate_cc_compare_p = true; + } + diff = ct - cf; /* Sign bit compares are better done using shifts than we do by using sbb. */ if (sign_bit_compare_p + || negate_cc_compare_p || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) { /* Detect overlap between destination and compare sources. */ rtx tmp = out; - if (!sign_bit_compare_p) + if (negate_cc_compare_p) + { + if (GET_MODE (op0) == DImode) + emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0)); + else + emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode), + gen_lowpart (SImode, op0))); + + tmp = gen_reg_rtx (mode); + if (mode == DImode) + emit_insn (gen_x86_movdicc_0_m1_neg (tmp)); + else + emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode, + tmp))); + } + else if (!sign_bit_compare_p) { rtx flags; bool fpcmp = false; @@ -3210,8 +3232,7 @@ ix86_expand_int_movcc (rtx operands[]) } diff = ct - cf; - if (reg_overlap_mentioned_p (out, op0) - || reg_overlap_mentioned_p (out, op1)) + if (reg_overlap_mentioned_p (out, compare_op)) tmp = gen_reg_rtx (mode); if (mode == DImode) @@ -3559,6 +3580,9 @@ ix86_expand_int_movcc (rtx operands[]) if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) return false; + operands[2] = op2; + operands[3] = op3; + /* If one of the two operands is an interesting constant, load a constant with the above and mask it in with a logical operation. */ @@ -4002,6 +4026,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V8HFmode: case E_V4SImode: case E_V2DImode: + case E_V1TImode: if (TARGET_SSE4_1) { gen = gen_sse4_1_pblendvb; @@ -10342,6 +10367,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8SI_FTYPE_V16HI_V16HI: case V4DI_FTYPE_V4DI_V4DI: case V4DI_FTYPE_V8SI_V8SI: + case V4DI_FTYPE_V32QI_V32QI: case V8DI_FTYPE_V64QI_V64QI: if (comparison == UNKNOWN) return ix86_expand_binop_builtin (icode, exp, target); @@ -13556,6 +13582,9 @@ rdseed_step: return target; case IX86_BUILTIN_READ_FLAGS: + if (ignore) + return const0_rtx; + emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); if (optimize @@ -15827,9 +15856,9 @@ quarter: else { word = expand_simple_binop (tmp_mode, ASHIFT, word, shift, - word, 1, OPTAB_LIB_WIDEN); + NULL_RTX, 1, OPTAB_LIB_WIDEN); word = expand_simple_binop (tmp_mode, IOR, word, elt, - word, 1, OPTAB_LIB_WIDEN); + NULL_RTX, 1, OPTAB_LIB_WIDEN); } } @@ -17033,7 +17062,8 @@ ix86_emit_fp_unordered_jump (rtx label) /* Output code to perform an sinh XFmode calculation. */ -void ix86_emit_i387_sinh (rtx op0, rtx op1) +void +ix86_emit_i387_sinh (rtx op0, rtx op1) { rtx e1 = gen_reg_rtx (XFmode); rtx e2 = gen_reg_rtx (XFmode); @@ -17081,7 +17111,8 @@ void ix86_emit_i387_sinh (rtx op0, rtx op1) /* Output code to perform an cosh XFmode calculation. 
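The new negate_cc_compare_p path in ix86_expand_int_movcc targets conditionals of the shape x != 0 ? -1 : y (and the mirrored == form), which can be computed branchlessly: neg sets the carry whenever x is nonzero, sbb then materializes 0 or -1, and an or merges in y. A source-level sketch; whether this sequence is actually chosen depends on the target and optimization level:

    /* x != 0 ? -1 : y can be done without a branch:
       neg x (CF = x != 0) ; sbb t,t (t = x ? -1 : 0) ; or t,y.  */
    int
    sel (int x, int y)
    {
      return x != 0 ? -1 : y;
    }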
*/ -void ix86_emit_i387_cosh (rtx op0, rtx op1) +void +ix86_emit_i387_cosh (rtx op0, rtx op1) { rtx e1 = gen_reg_rtx (XFmode); rtx e2 = gen_reg_rtx (XFmode); @@ -17103,7 +17134,8 @@ void ix86_emit_i387_cosh (rtx op0, rtx op1) /* Output code to perform an tanh XFmode calculation. */ -void ix86_emit_i387_tanh (rtx op0, rtx op1) +void +ix86_emit_i387_tanh (rtx op0, rtx op1) { rtx e1 = gen_reg_rtx (XFmode); rtx e2 = gen_reg_rtx (XFmode); @@ -17149,7 +17181,8 @@ void ix86_emit_i387_tanh (rtx op0, rtx op1) /* Output code to perform an asinh XFmode calculation. */ -void ix86_emit_i387_asinh (rtx op0, rtx op1) +void +ix86_emit_i387_asinh (rtx op0, rtx op1) { rtx e1 = gen_reg_rtx (XFmode); rtx e2 = gen_reg_rtx (XFmode); @@ -17201,7 +17234,8 @@ void ix86_emit_i387_asinh (rtx op0, rtx op1) /* Output code to perform an acosh XFmode calculation. */ -void ix86_emit_i387_acosh (rtx op0, rtx op1) +void +ix86_emit_i387_acosh (rtx op0, rtx op1) { rtx e1 = gen_reg_rtx (XFmode); rtx e2 = gen_reg_rtx (XFmode); @@ -17227,7 +17261,8 @@ void ix86_emit_i387_acosh (rtx op0, rtx op1) /* Output code to perform an atanh XFmode calculation. */ -void ix86_emit_i387_atanh (rtx op0, rtx op1) +void +ix86_emit_i387_atanh (rtx op0, rtx op1) { rtx e1 = gen_reg_rtx (XFmode); rtx e2 = gen_reg_rtx (XFmode); @@ -17278,7 +17313,8 @@ void ix86_emit_i387_atanh (rtx op0, rtx op1) /* Output code to perform a log1p XFmode calculation. */ -void ix86_emit_i387_log1p (rtx op0, rtx op1) +void +ix86_emit_i387_log1p (rtx op0, rtx op1) { rtx_code_label *label1 = gen_label_rtx (); rtx_code_label *label2 = gen_label_rtx (); @@ -17288,6 +17324,11 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1) rtx cst, cstln2, cst1; rtx_insn *insn; + /* The emit_jump call emits pending stack adjust, make sure it is emitted + before the conditional jump, otherwise the stack adjustment will be + only conditional. */ + do_pending_stack_adjust (); + cst = const_double_from_real_value (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ @@ -17317,7 +17358,8 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1) } /* Emit code for round calculation. */ -void ix86_emit_i387_round (rtx op0, rtx op1) +void +ix86_emit_i387_round (rtx op0, rtx op1) { machine_mode inmode = GET_MODE (op1); machine_mode outmode = GET_MODE (op0); @@ -17431,7 +17473,8 @@ void ix86_emit_i387_round (rtx op0, rtx op1) /* Output code to perform a Newton-Rhapson approximation of a single precision floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */ -void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) +void +ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) { rtx x0, x1, e0, e1; @@ -17482,7 +17525,8 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) /* Output code to perform a Newton-Rhapson approximation of a single precision floating point [reciprocal] square root. */ -void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) +void +ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) { rtx x0, e0, e1, e2, e3, mthree, mhalf; REAL_VALUE_TYPE r; @@ -20907,6 +20951,101 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) return true; } +/* Implement permutation with pslldq + psrldq + por when pshufb is not + available. 
*/ +static bool +expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn) +{ + unsigned i, nelt = d->nelt; + unsigned start1, end1 = -1; + machine_mode vmode = d->vmode, imode; + int start2 = -1; + bool clear_op0, clear_op1; + unsigned inner_size; + rtx op0, op1, dop1; + rtx (*gen_vec_shr) (rtx, rtx, rtx); + rtx (*gen_vec_shl) (rtx, rtx, rtx); + + /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */ + if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode)) + return false; + + start1 = d->perm[0]; + for (i = 1; i < nelt; i++) + { + if (d->perm[i] != d->perm[i-1] + 1 + || d->perm[i] == nelt) + { + if (start2 == -1) + { + start2 = d->perm[i]; + end1 = d->perm[i-1]; + } + else + return false; + } + } + + clear_op0 = end1 != nelt - 1; + clear_op1 = start2 % nelt != 0; + /* pandn/pand is needed to clear upper/lower bits of op0/op1. */ + if (!pandn && (clear_op0 || clear_op1)) + return false; + + if (d->testing_p) + return true; + + gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi; + gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi; + imode = GET_MODE_INNER (vmode); + inner_size = GET_MODE_BITSIZE (imode); + op0 = gen_reg_rtx (vmode); + op1 = gen_reg_rtx (vmode); + + if (start1) + emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size))); + else + emit_move_insn (op0, d->op0); + + dop1 = d->op1; + if (d->one_operand_p) + dop1 = d->op0; + + int shl_offset = end1 - start1 + 1 - start2 % nelt; + if (shl_offset) + emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size))); + else + emit_move_insn (op1, dop1); + + /* Clear lower/upper bits for op0/op1. */ + if (clear_op0 || clear_op1) + { + rtx vec[16]; + rtx const_vec; + rtx clear; + for (i = 0; i != nelt; i++) + { + if (i < (end1 - start1 + 1)) + vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode); + else + vec[i] = CONST0_RTX (imode); + } + const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec)); + const_vec = validize_mem (force_const_mem (vmode, const_vec)); + clear = force_reg (vmode, const_vec); + + if (clear_op0) + emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear)); + if (clear_op1) + emit_move_insn (op1, gen_rtx_AND (vmode, + gen_rtx_NOT (vmode, clear), + op1)); + } + + emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1)); + return true; +} + /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI operands with two "and" and "pack" or two "shift" and "pack" insns. @@ -21819,6 +21958,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pshufb2 (d)) return true; + if (expand_vec_perm_pslldq_psrldq_por (d, false)) + return true; + if (expand_vec_perm_interleave3 (d)) return true; @@ -21857,6 +21999,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_even_odd (d)) return true; + /* Generate four or five instructions. */ + if (expand_vec_perm_pslldq_psrldq_por (d, true)) + return true; + /* Even longer sequences. */ if (expand_vec_perm_vpshufb4_vpermq2 (d)) return true; @@ -21924,9 +22070,13 @@ canonicalize_perm (struct expand_vec_perm_d *d) /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. 
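expand_vec_perm_pslldq_psrldq_por handles two-operand permutations whose selector is one contiguous run of indices, i.e. a byte window across the concatenation of the operands: shift the first operand right, the second left, and or the halves together (with pand/pandn when an edge has to be cleared). A sketch of such a selector, assuming GCC's generic vector extensions:

    typedef unsigned char v16qi __attribute__ ((vector_size (16)));

    /* Selector {5,...,20}: elements 5..15 of A followed by 0..4 of B.
       With -msse2 but no SSSE3 pshufb this can be expanded as
       psrldq $5 on A, pslldq $11 on B, then por.  */
    v16qi
    window (v16qi a, v16qi b)
    {
      const v16qi sel = { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                          16, 17, 18, 19, 20 };
      return __builtin_shuffle (a, b, sel);
    }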
*/ bool -ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) +ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) { + if (vmode != op_mode) + return false; + struct expand_vec_perm_d d; unsigned char perm[MAX_VECT_LEN]; unsigned int i, nelt, which; @@ -23237,9 +23387,10 @@ ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, *rem_p = rem; } -void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val, - enum rtx_code code, bool after, - bool doubleword) +void +ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val, + enum rtx_code code, bool after, + bool doubleword) { rtx old_reg, new_reg, old_mem, success; machine_mode mode = GET_MODE (target); @@ -23283,10 +23434,11 @@ void ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val, it will be relaxed to an atomic load + compare, and skip cmpxchg instruction if mem != exp_input. */ -void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, - rtx mem, rtx exp_input, rtx new_input, - rtx mem_model, bool doubleword, - rtx_code_label *loop_label) +void +ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, + rtx mem, rtx exp_input, rtx new_input, + rtx mem_model, bool doubleword, + rtx_code_label *loop_label) { rtx_code_label *cmp_label = NULL; rtx_code_label *done_label = NULL; @@ -23385,6 +23537,7 @@ void ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val, /* If mem is not expected, pause and loop back. */ emit_label (cmp_label); + emit_move_insn (target_val, new_mem); emit_insn (gen_pause ()); emit_jump_insn (gen_jump (loop_label)); emit_barrier (); diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h index 9d320c2..6c65019 100644 --- a/gcc/config/i386/i386-expand.h +++ b/gcc/config/i386/i386-expand.h @@ -48,8 +48,9 @@ rtx gen_push (rtx arg); rtx gen_pop (rtx arg); rtx ix86_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, int ignore); -bool ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel); +bool ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel); bool ix86_notrack_prefixed_insn_p (rtx_insn *); machine_mode ix86_split_reduction (machine_mode mode); void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0, diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 6fe41c3..8908e42 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -711,8 +711,7 @@ gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr) switch (GET_MODE_NUNITS (vmode)) { case 1: - /* We are not using this case currently. */ - gcc_unreachable (); + return gen_rtx_SUBREG (vmode, gpr, 0); case 2: return gen_rtx_VEC_CONCAT (vmode, gpr, CONST0_RTX (GET_MODE_INNER (vmode))); @@ -932,6 +931,48 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn) } } +/* Convert COMPARE to vector mode. */ + +rtx +general_scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn) +{ + rtx tmp = gen_reg_rtx (vmode); + rtx src; + convert_op (&op1, insn); + /* Comparison against anything other than zero, requires an XOR. */ + if (op2 != const0_rtx) + { + convert_op (&op2, insn); + /* If both operands are MEMs, explicitly load the OP1 into TMP. 
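With the reworked convertible_comparison_p and the new convert_compare helper, the scalar-to-vector (STV) pass can now convert whole double-word equality tests (TImode on 64-bit, DImode with -m32) into pxor/ptest sequences, instead of only the old ior-of-subregs check against zero. A sketch of code that qualifies, assuming -O2 -msse4.1:

    /* 64-bit: a TImode equality that STV can keep in SSE registers.  */
    #ifdef __SIZEOF_INT128__
    int
    eq_ti (__int128 a, __int128 b)
    {
      return a == b;
    }
    #endif

    /* With -m32 the same idea applies at DImode.  */
    int
    eq_di (long long a, long long b)
    {
      return a == b;
    }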
*/ + if (MEM_P (op1) && MEM_P (op2)) + { + emit_insn_before (gen_rtx_SET (tmp, op1), insn); + src = tmp; + } + else + src = op1; + src = gen_rtx_XOR (vmode, src, op2); + } + else + src = op1; + emit_insn_before (gen_rtx_SET (tmp, src), insn); + + if (vmode == V2DImode) + emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (tmp), + copy_rtx_if_shared (tmp), + copy_rtx_if_shared (tmp)), + insn); + else if (vmode == V4SImode) + emit_insn_before (gen_sse2_pshufd (copy_rtx_if_shared (tmp), + copy_rtx_if_shared (tmp), + const0_rtx), + insn); + + return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (tmp), + copy_rtx_if_shared (tmp)), + UNSPEC_PTEST); +} + /* Convert INSN to vector mode. */ void @@ -1090,19 +1131,8 @@ general_scalar_chain::convert_insn (rtx_insn *insn) break; case COMPARE: - src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); - - gcc_assert (REG_P (src) && GET_MODE (src) == DImode); - subreg = gen_rtx_SUBREG (V2DImode, src, 0); - emit_insn_before (gen_vec_interleave_lowv2di - (copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg)), - insn); dst = gen_rtx_REG (CCmode, FLAGS_REG); - src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg)), - UNSPEC_PTEST); + src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn); break; case CONST_INT: @@ -1339,20 +1369,14 @@ pseudo_reg_set (rtx_insn *insn) return set; } -/* Check if comparison INSN may be transformed - into vector comparison. Currently we transform - zero checks only which look like: - - (set (reg:CCZ 17 flags) - (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) - (subreg:SI (reg:DI x) 0)) - (const_int 0 [0]))) */ +/* Check if comparison INSN may be transformed into vector comparison. + Currently we transform equality/inequality checks which look like: + (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y))) */ static bool convertible_comparison_p (rtx_insn *insn, enum machine_mode mode) { - /* ??? Currently convertible for double-word DImode chain only. */ - if (TARGET_64BIT || mode != DImode) + if (mode != (TARGET_64BIT ? 
TImode : DImode)) return false; if (!TARGET_SSE4_1) @@ -1375,31 +1399,14 @@ convertible_comparison_p (rtx_insn *insn, enum machine_mode mode) rtx op1 = XEXP (src, 0); rtx op2 = XEXP (src, 1); - if (op2 != CONST0_RTX (GET_MODE (op2))) + if (!CONST_INT_P (op1) + && ((!REG_P (op1) && !MEM_P (op1)) + || GET_MODE (op1) != mode)) return false; - if (GET_CODE (op1) != IOR) - return false; - - op2 = XEXP (op1, 1); - op1 = XEXP (op1, 0); - - if (!SUBREG_P (op1) - || !SUBREG_P (op2) - || GET_MODE (op1) != SImode - || GET_MODE (op2) != SImode - || ((SUBREG_BYTE (op1) != 0 - || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) - && (SUBREG_BYTE (op2) != 0 - || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) - return false; - - op1 = SUBREG_REG (op1); - op2 = SUBREG_REG (op2); - - if (op1 != op2 - || !REG_P (op1) - || GET_MODE (op1) != DImode) + if (!CONST_INT_P (op2) + && ((!REG_P (op2) && !MEM_P (op2)) + || GET_MODE (op2) != mode)) return false; return true; diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h index 5c30760..891cb46 100644 --- a/gcc/config/i386/i386-features.h +++ b/gcc/config/i386/i386-features.h @@ -181,6 +181,7 @@ class general_scalar_chain : public scalar_chain void convert_reg (rtx_insn *insn, rtx dst, rtx src); void make_vector_copies (rtx_insn *, rtx); void convert_registers (); + rtx convert_compare (rtx op1, rtx op2, rtx_insn *insn); int vector_const_cost (rtx exp); }; diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 8055393..e11f681 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -138,6 +138,8 @@ along with GCC; see the file COPYING3. If not see #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL) +#define m_LUJIAZUI (HOST_WIDE_INT_1U<<PROCESSOR_LUJIAZUI) + #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE) #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6) #define m_K6_GEODE (m_K6 | m_GEODE) @@ -755,6 +757,7 @@ static const struct processor_costs *processor_cost_table[] = &alderlake_cost, &icelake_cost, &intel_cost, + &lujiazui_cost, &geode_cost, &k6_cost, &athlon_cost, @@ -2212,8 +2215,8 @@ ix86_option_override_internal (bool main_args_p, if (i == pta_size) { error (main_args_p - ? G_("bad value (%qs) for %<-march=%> switch") - : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"), + ? G_("bad value %qs for %<-march=%> switch") + : G_("bad value %qs for %<target(\"arch=\")%> attribute"), opts->x_ix86_arch_string); auto_vec <const char *> candidates; @@ -2292,8 +2295,8 @@ ix86_option_override_internal (bool main_args_p, if (ix86_tune_specified && i == pta_size) { error (main_args_p - ? G_("bad value (%qs) for %<-mtune=%> switch") - : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"), + ? G_("bad value %qs for %<-mtune=%> switch") + : G_("bad value %qs for %<target(\"tune=\")%> attribute"), opts->x_ix86_tune_string); auto_vec <const char *> candidates; diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index d77ad83..3d189e1 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -3706,7 +3706,7 @@ zero_call_used_regno_mode (const unsigned int regno) else if (MASK_REGNO_P (regno)) return HImode; else if (MMX_REGNO_P (regno)) - return V4HImode; + return V2SImode; else gcc_unreachable (); } @@ -3753,16 +3753,17 @@ zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs) needs to be cleared, the whole stack should be cleared. 
However, x87 stack registers that hold the return value should be excluded. x87 returns in the top (two for complex values) register, so - num_of_st should be 7/6 when x87 returns, otherwise it will be 8. */ + num_of_st should be 7/6 when x87 returns, otherwise it will be 8. + return the value of num_of_st. */ -static bool +static int zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs) { /* If the FPU is disabled, no need to zero all st registers. */ if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) - return false; + return 0; unsigned int num_of_st = 0; for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) @@ -3774,7 +3775,7 @@ zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs) } if (num_of_st == 0) - return false; + return 0; bool return_with_x87 = false; return_with_x87 = (crtl->return_rtx @@ -3802,7 +3803,7 @@ zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs) insn = emit_insn (gen_rtx_SET (st_reg, st_reg)); add_reg_note (insn, REG_DEAD, st_reg); } - return true; + return num_of_st; } @@ -3825,19 +3826,12 @@ zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs, if (!need_zero_all_mm) return false; - rtx zero_mmx = NULL_RTX; - machine_mode mode = V4HImode; + machine_mode mode = V2SImode; for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++) if (regno != ret_mmx_regno) { rtx reg = gen_rtx_REG (mode, regno); - if (zero_mmx == NULL_RTX) - { - zero_mmx = reg; - emit_insn (gen_rtx_SET (reg, CONST0_RTX (mode))); - } - else - emit_move_insn (reg, zero_mmx); + emit_insn (gen_rtx_SET (reg, CONST0_RTX (mode))); } return true; } @@ -3851,7 +3845,7 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) { HARD_REG_SET zeroed_hardregs; bool all_sse_zeroed = false; - bool all_st_zeroed = false; + int all_st_zeroed_num = 0; bool all_mm_zeroed = false; CLEAR_HARD_REG_SET (zeroed_hardregs); @@ -3881,9 +3875,17 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) if (!exit_with_mmx_mode) /* x87 exit mode, we should zero all st registers together. */ { - all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs); - if (all_st_zeroed) - SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG); + all_st_zeroed_num = zero_all_st_registers (need_zeroed_hardregs); + + if (all_st_zeroed_num > 0) + for (unsigned int regno = FIRST_STACK_REG; regno <= LAST_STACK_REG; regno++) + /* x87 stack registers that hold the return value should be excluded. + x87 returns in the top (two for complex values) register. */ + if (all_st_zeroed_num == 8 + || !((all_st_zeroed_num >= 6 && regno == REGNO (crtl->return_rtx)) + || (all_st_zeroed_num == 6 + && (regno == (REGNO (crtl->return_rtx) + 1))))) + SET_HARD_REG_BIT (zeroed_hardregs, regno); } else /* MMX exit mode, check whether we can zero all mm registers. */ @@ -3899,11 +3901,6 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) /* Now, generate instructions to zero all the other registers. 
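The reworked ix86_zero_call_used_regs reports exactly which x87 stack registers were cleared (excluding the one or two that carry the return value) and zeroes each selected MMX register directly with a V2SImode zero instead of chaining moves through the first cleared register. For reference, the user-level trigger:

    /* Clears call-used registers before returning; also available
       globally as -fzero-call-used-regs=all.  */
    __attribute__ ((zero_call_used_regs ("all")))
    int
    bump (int x)
    {
      return x + 1;
    }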
*/ - rtx zero_gpr = NULL_RTX; - rtx zero_vector = NULL_RTX; - rtx zero_mask = NULL_RTX; - rtx zero_mmx = NULL_RTX; - for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) { if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) @@ -3914,59 +3911,34 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) SET_HARD_REG_BIT (zeroed_hardregs, regno); - rtx reg, tmp, zero_rtx; machine_mode mode = zero_call_used_regno_mode (regno); - reg = gen_rtx_REG (mode, regno); - zero_rtx = CONST0_RTX (mode); + rtx reg = gen_rtx_REG (mode, regno); + rtx tmp = gen_rtx_SET (reg, CONST0_RTX (mode)); - if (mode == SImode) - if (zero_gpr == NULL_RTX) - { - zero_gpr = reg; - tmp = gen_rtx_SET (reg, zero_rtx); - if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) - { - rtx clob = gen_rtx_CLOBBER (VOIDmode, - gen_rtx_REG (CCmode, - FLAGS_REG)); - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, - tmp, - clob)); - } - emit_insn (tmp); - } - else - emit_move_insn (reg, zero_gpr); - else if (mode == V4SFmode) - if (zero_vector == NULL_RTX) - { - zero_vector = reg; - tmp = gen_rtx_SET (reg, zero_rtx); - emit_insn (tmp); - } - else - emit_move_insn (reg, zero_vector); - else if (mode == HImode) - if (zero_mask == NULL_RTX) - { - zero_mask = reg; - tmp = gen_rtx_SET (reg, zero_rtx); - emit_insn (tmp); - } - else - emit_move_insn (reg, zero_mask); - else if (mode == V4HImode) - if (zero_mmx == NULL_RTX) - { - zero_mmx = reg; - tmp = gen_rtx_SET (reg, zero_rtx); - emit_insn (tmp); - } - else - emit_move_insn (reg, zero_mmx); - else - gcc_unreachable (); + switch (mode) + { + case E_SImode: + if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) + { + rtx clob = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (CCmode, + FLAGS_REG)); + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, + tmp, + clob)); + } + /* FALLTHRU. */ + + case E_V4SFmode: + case E_HImode: + case E_V2SImode: + emit_insn (tmp); + break; + + default: + gcc_unreachable (); + } } return zeroed_hardregs; } @@ -4882,6 +4854,7 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, { int i, prev_size = 0; tree temp = create_tmp_var (type, "va_arg_tmp"); + TREE_ADDRESSABLE (temp) = 1; /* addr = &temp; */ t = build1 (ADDR_EXPR, build_pointer_type (type), temp); @@ -6515,7 +6488,8 @@ ix86_initial_elimination_offset (int from, int to) } /* Emits a warning for unsupported msabi to sysv pro/epilogues. */ -void warn_once_call_ms2sysv_xlogues (const char *feature) +void +warn_once_call_ms2sysv_xlogues (const char *feature) { static bool warned_once = false; if (!warned_once) @@ -16846,7 +16820,8 @@ int ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, bool has_vex_w) { - int i; + int i, reg_only = 2 + 1; + bool has_mem = false; /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 byte VEX prefix. */ @@ -16866,16 +16841,23 @@ ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, if (GET_MODE (recog_data.operand[i]) == DImode && GENERAL_REG_P (recog_data.operand[i])) return 3 + 1; + + /* REX.B bit requires 3-byte VEX. Right here we don't know which + operand will be encoded using VEX.B, so be conservative. */ + if (REX_INT_REGNO_P (recog_data.operand[i]) + || REX_SSE_REGNO_P (recog_data.operand[i])) + reg_only = 3 + 1; } - else + else if (MEM_P (recog_data.operand[i])) { /* REX.X or REX.B bits use 3 byte VEX prefix. 
*/ - if (MEM_P (recog_data.operand[i]) - && x86_extended_reg_mentioned_p (recog_data.operand[i])) + if (x86_extended_reg_mentioned_p (recog_data.operand[i])) return 3 + 1; + + has_mem = true; } - return 2 + 1; + return has_mem ? 2 + 1 : reg_only; } @@ -18368,10 +18350,16 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } break; + case IX86_BUILTIN_BLENDVPD: + /* blendvpd is under sse4.1 but pcmpgtq is under sse4.2, + w/o sse4.2, it's veclowered to scalar operations and + not combined back. */ + if (!TARGET_SSE4_2) + break; + /* FALLTHRU. */ case IX86_BUILTIN_PBLENDVB128: case IX86_BUILTIN_PBLENDVB256: case IX86_BUILTIN_BLENDVPS: - case IX86_BUILTIN_BLENDVPD: case IX86_BUILTIN_BLENDVPS256: case IX86_BUILTIN_BLENDVPD256: gcc_assert (n_args == 3); @@ -18791,7 +18779,8 @@ ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in) return NULL_TREE; } - tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); + tree fndecl = mathfn_built_in (el_mode == DFmode + ? double_type_node : float_type_node, fn); bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF) @@ -18883,7 +18872,8 @@ ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in) return NULL_TREE; } - tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); + tree fndecl = mathfn_built_in (el_mode == DFmode + ? double_type_node : float_type_node, fn); bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); sprintf (name + 7, "%s", bname+10); @@ -20159,6 +20149,18 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) return (GET_MODE_SIZE (mode1) == 8 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); + /* SCmode and DImode can be tied. */ + if ((mode1 == E_SCmode && mode2 == E_DImode) + || (mode1 == E_DImode && mode2 == E_SCmode)) + return TARGET_64BIT; + + /* [SD]Cmode and V2[SD]Fmode modes can be tied. */ + if ((mode1 == E_SCmode && mode2 == E_V2SFmode) + || (mode1 == E_V2SFmode && mode2 == E_SCmode) + || (mode1 == E_DCmode && mode2 == E_V2DFmode) + || (mode1 == E_V2DFmode && mode2 == E_DCmode)) + return true; + return false; } @@ -20652,7 +20654,17 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, op0 = XEXP (op0, 0), mode = GET_MODE (op0); } - *total = (cost->mult_init[MODE_INDEX (mode)] + int mult_init; + // Double word multiplication requires 3 mults and 2 adds. 
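The multiplication cost hook now charges a double-word multiply as three word multiplies plus two additions instead of a single multiply in the wide mode. The decomposition behind that count, sketched with 32-bit words:

    /* (hi:lo) = a * b, truncated to 64 bits, built from 32-bit words:
       three single-word multiplies and two adds.  */
    unsigned long long
    mul64_by_words (unsigned long long a, unsigned long long b)
    {
      unsigned int al = (unsigned int) a, ah = (unsigned int) (a >> 32);
      unsigned int bl = (unsigned int) b, bh = (unsigned int) (b >> 32);

      unsigned long long lo = (unsigned long long) al * bl;  /* mult 1 */
      unsigned int mid = al * bh + ah * bl;         /* mults 2 and 3, add 1 */
      return lo + ((unsigned long long) mid << 32); /* add 2 */
    }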
+ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + { + mult_init = 3 * cost->mult_init[MODE_INDEX (word_mode)] + + 2 * cost->add; + nbits *= 3; + } + else mult_init = cost->mult_init[MODE_INDEX (mode)]; + + *total = (mult_init + nbits * cost->mult_bit + rtx_cost (op0, mode, outer_code, opno, speed) + rtx_cost (op1, mode, outer_code, opno, speed)); @@ -20746,70 +20758,125 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) - { - *total = cost->addss; - return false; - } + *total = cost->addss; else if (X87_FLOAT_MODE_P (mode)) - { - *total = cost->fadd; - return false; - } + *total = cost->fadd; else if (FLOAT_MODE_P (mode)) - { - *total = ix86_vec_cost (mode, cost->addss); - return false; - } - /* FALLTHRU */ + *total = ix86_vec_cost (mode, cost->addss); + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + *total = ix86_vec_cost (mode, cost->sse_op); + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + *total = cost->add * 2; + else + *total = cost->add; + return false; - case AND: case IOR: case XOR: - if (GET_MODE_CLASS (mode) == MODE_INT - && GET_MODE_SIZE (mode) > UNITS_PER_WORD) + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + *total = ix86_vec_cost (mode, cost->sse_op); + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + *total = cost->add * 2; + else + *total = cost->add; + return false; + + case AND: + if (address_no_seg_operand (x, mode)) { - *total = (cost->add * 2 - + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) - << (GET_MODE (XEXP (x, 0)) != DImode)) - + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) - << (GET_MODE (XEXP (x, 1)) != DImode))); + *total = cost->lea; return true; } - else if (code == AND - && address_no_seg_operand (x, mode)) + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) { - *total = cost->lea; - return true; + /* pandn is a single instruction. 
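The AND cost handling now recognizes the (and (not x) y) shape as a single instruction: pandn for vector modes, andn for scalars when BMI is enabled, and two such instructions for a double-word scalar. Source shapes that produce it:

    typedef int v4si __attribute__ ((vector_size (16)));

    /* Vector and-not: a single pandn/vpandn.  */
    v4si
    andnot_v (v4si x, v4si y)
    {
      return ~x & y;
    }

    /* Scalar and-not: a single andn when compiled with -mbmi.  */
    unsigned long
    andnot_s (unsigned long x, unsigned long y)
    {
      return ~x & y;
    }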
*/ + if (GET_CODE (XEXP (x, 0)) == NOT) + { + *total = ix86_vec_cost (mode, cost->sse_op) + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; + } + else if (GET_CODE (XEXP (x, 1)) == NOT) + { + *total = ix86_vec_cost (mode, cost->sse_op) + + rtx_cost (XEXP (x, 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (XEXP (x, 1), 0), mode, + outer_code, opno, speed); + return true; + } + *total = ix86_vec_cost (mode, cost->sse_op); } - /* FALLTHRU */ - - case NEG: - if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) { - *total = cost->sse_op; - return false; + if (TARGET_BMI && GET_CODE (XEXP (x,0)) == NOT) + { + *total = cost->add * 2 + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; + } + else if (TARGET_BMI && GET_CODE (XEXP (x, 1)) == NOT) + { + *total = cost->add * 2 + + rtx_cost (XEXP (x, 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (XEXP (x, 1), 0), mode, + outer_code, opno, speed); + return true; + } + *total = cost->add * 2; } - else if (X87_FLOAT_MODE_P (mode)) + else if (TARGET_BMI && GET_CODE (XEXP (x,0)) == NOT) { - *total = cost->fchs; - return false; + *total = cost->add + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed); + return true; } - else if (FLOAT_MODE_P (mode)) + else if (TARGET_BMI && GET_CODE (XEXP (x,1)) == NOT) { - *total = ix86_vec_cost (mode, cost->sse_op); - return false; + *total = cost->add + + rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) + + rtx_cost (XEXP (XEXP (x, 1), 0), mode, + outer_code, opno, speed); + return true; } - /* FALLTHRU */ + else + *total = cost->add; + return false; case NOT: if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - *total = ix86_vec_cost (mode, cost->sse_op); + // vnot is pxor -1. + *total = ix86_vec_cost (mode, cost->sse_op) + 1; else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) *total = cost->add * 2; else *total = cost->add; return false; + case NEG: + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) + *total = cost->sse_op; + else if (X87_FLOAT_MODE_P (mode)) + *total = cost->fchs; + else if (FLOAT_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->sse_op); + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + *total = ix86_vec_cost (mode, cost->sse_op); + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + *total = cost->add * 3; + else + *total = cost->add; + return false; + case COMPARE: rtx op0, op1; op0 = XEXP (x, 0); @@ -20936,6 +21003,51 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, *total += 1; return false; + case ZERO_EXTRACT: + if (XEXP (x, 1) == const1_rtx + && GET_CODE (XEXP (x, 2)) == ZERO_EXTEND + && GET_MODE (XEXP (x, 2)) == SImode + && GET_MODE (XEXP (XEXP (x, 2), 0)) == QImode) + { + /* Ignore cost of zero extension and masking of last argument. */ + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + *total += rtx_cost (XEXP (XEXP (x, 2), 0), mode, code, 2, speed); + return true; + } + return false; + + case IF_THEN_ELSE: + if (TARGET_XOP + && VECTOR_MODE_P (mode) + && (GET_MODE_SIZE (mode) == 16 || GET_MODE_SIZE (mode) == 32)) + { + /* vpcmov. */ + *total = speed ? 
COSTS_N_INSNS (2) : COSTS_N_BYTES (6); + if (!REG_P (XEXP (x, 0))) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!REG_P (XEXP (x, 1))) + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + if (!REG_P (XEXP (x, 2))) + *total += rtx_cost (XEXP (x, 2), mode, code, 2, speed); + return true; + } + else if (TARGET_CMOVE + && SCALAR_INT_MODE_P (mode) + && GET_MODE_SIZE (mode) <= UNITS_PER_WORD) + { + /* cmov. */ + *total = COSTS_N_INSNS (1); + if (!REG_P (XEXP (x, 0))) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!REG_P (XEXP (x, 1))) + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + if (!REG_P (XEXP (x, 2))) + *total += rtx_cost (XEXP (x, 2), mode, code, 2, speed); + return true; + } + return false; + default: return false; } @@ -21927,6 +22039,65 @@ ix86_seh_fixup_eh_fallthru (void) emit_insn_after (gen_nops (const1_rtx), insn); } } +/* Split vector load from parm_decl to elemental loads to avoid STLF + stalls. */ +static void +ix86_split_stlf_stall_load () +{ + rtx_insn* insn, *start = get_insns (); + unsigned window = 0; + + for (insn = start; insn; insn = NEXT_INSN (insn)) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + window++; + /* Insert 64 vaddps %xmm18, %xmm19, %xmm20(no dependence between each + other, just emulate for pipeline) before stalled load, stlf stall + case is as fast as no stall cases on CLX. + Since CFG is freed before machine_reorg, just do a rough + calculation of the window according to the layout. */ + if (window > (unsigned) x86_stlf_window_ninsns) + return; + + if (any_uncondjump_p (insn) + || ANY_RETURN_P (PATTERN (insn)) + || CALL_P (insn)) + return; + + rtx set = single_set (insn); + if (!set) + continue; + rtx src = SET_SRC (set); + if (!MEM_P (src) + /* Only handle V2DFmode load since it doesn't need any scratch + register. */ + || GET_MODE (src) != E_V2DFmode + || !MEM_EXPR (src) + || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL) + continue; + + rtx zero = CONST0_RTX (V2DFmode); + rtx dest = SET_DEST (set); + rtx m = adjust_address (src, DFmode, 0); + rtx loadlpd = gen_sse2_loadlpd (dest, zero, m); + emit_insn_before (loadlpd, insn); + m = adjust_address (src, DFmode, 8); + rtx loadhpd = gen_sse2_loadhpd (dest, dest, m); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fputs ("Due to potential STLF stall, split instruction:\n", + dump_file); + print_rtl_single (dump_file, insn); + fputs ("To:\n", dump_file); + print_rtl_single (dump_file, loadlpd); + print_rtl_single (dump_file, loadhpd); + } + PATTERN (insn) = loadhpd; + INSN_CODE (insn) = -1; + gcc_assert (recog_memoized (insn) != -1); + } +} /* Implement machine specific optimizations. We implement padding of returns for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. 
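ix86_split_stlf_stall_load scans the first x86_stlf_window_ninsns instructions of a function and splits a V2DFmode load from a PARM_DECL into movlpd + movhpd, so each half can forward from the caller's narrower stores instead of stalling. A hypothetical C-level scenario; the struct layout and whether this exact case trips the pass are assumptions, not taken from the patch:

    typedef double v2df __attribute__ ((vector_size (16)));

    struct triple { double a, b, c; };   /* passed in memory on x86-64 */

    /* The caller usually stores the fields with separate 8-byte stores;
       reading them back with one 16-byte vector load can hit a
       store-to-load-forwarding stall, which the split avoids.  */
    double
    sum_ab (struct triple p)
    {
      v2df v;
      __builtin_memcpy (&v, &p.a, sizeof v);
      return v[0] + v[1];
    }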
*/ @@ -21942,6 +22113,8 @@ ix86_reorg (void) if (optimize && optimize_function_for_speed_p (cfun)) { + if (TARGET_SSE2) + ix86_split_stlf_stall_load (); if (TARGET_PAD_SHORT_FUNCTION) ix86_pad_short_function (); else if (TARGET_PAD_RETURNS) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 0d28e57..f16df63 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -390,6 +390,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_SLOW_PSHUFB] #define TARGET_AVOID_4BYTE_PREFIXES \ ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES] +#define TARGET_USE_GATHER_2PARTS \ + ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] +#define TARGET_USE_GATHER_4PARTS \ + ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] #define TARGET_USE_GATHER \ ix86_tune_features[X86_TUNE_USE_GATHER] #define TARGET_FUSE_CMP_AND_BRANCH_32 \ @@ -2239,6 +2243,7 @@ enum processor_type PROCESSOR_ALDERLAKE, PROCESSOR_ROCKETLAKE, PROCESSOR_INTEL, + PROCESSOR_LUJIAZUI, PROCESSOR_GEODE, PROCESSOR_K6, PROCESSOR_ATHLON, @@ -2323,10 +2328,11 @@ constexpr wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT | PTA_PCONFIG | PTA_WBNOINVD | PTA_CLWB; constexpr wide_int_bitmask PTA_TIGERLAKE = PTA_ICELAKE_CLIENT | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_CLWB | PTA_AVX512VP2INTERSECT | PTA_KL | PTA_WIDEKL; -constexpr wide_int_bitmask PTA_SAPPHIRERAPIDS = PTA_COOPERLAKE | PTA_MOVDIRI +constexpr wide_int_bitmask PTA_SAPPHIRERAPIDS = PTA_ICELAKE_SERVER | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_ENQCMD | PTA_CLDEMOTE | PTA_PTWRITE | PTA_WAITPKG | PTA_SERIALIZE | PTA_TSXLDTRK | PTA_AMX_TILE - | PTA_AMX_INT8 | PTA_AMX_BF16 | PTA_UINTR | PTA_AVXVNNI | PTA_AVX512FP16; + | PTA_AMX_INT8 | PTA_AMX_BF16 | PTA_UINTR | PTA_AVXVNNI | PTA_AVX512FP16 + | PTA_AVX512BF16; constexpr wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD | PTA_PREFETCHWT1; constexpr wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE; @@ -2848,10 +2854,10 @@ extern enum attr_cpu ix86_schedule; #define NUM_X86_64_MS_CLOBBERED_REGS 12 #endif -/* __builtin_eh_return can't handle stack realignment, so restrict to - general regs in 32-bit libgcc functions that call it. */ +/* __builtin_eh_return can't handle stack realignment, so disable MMX/SSE + in 32-bit libgcc functions that call it. */ #ifndef __x86_64__ -#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((target ("general-regs-only"))) +#define LIBGCC2_UNWIND_ATTRIBUTE __attribute__((target ("no-mmx,no-sse"))) #endif /* diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 46a2663..3093cb5 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -473,8 +473,8 @@ ;; Processor type. (define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem, - atom,slm,glm,haswell,generic,amdfam10,bdver1,bdver2,bdver3, - bdver4,btver2,znver1,znver2,znver3" + atom,slm,glm,haswell,generic,lujiazui,amdfam10,bdver1, + bdver2,bdver3,bdver4,btver2,znver1,znver2,znver3" (const (symbol_ref "ix86_schedule"))) ;; A basic instruction type. 
Refinements due to arguments to be @@ -1310,6 +1310,7 @@ (include "glm.md") (include "core2.md") (include "haswell.md") +(include "lujiazui.md") ;; Operand and operator predicates and constraints @@ -1338,16 +1339,38 @@ DONE; }) +(define_expand "cbranchoi4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:OI 1 "nonimmediate_operand") + (match_operand:OI 2 "nonimmediate_operand"))) + (set (pc) (if_then_else + (match_operator 0 "bt_comparison_operator" + [(reg:CC FLAGS_REG) (const_int 0)]) + (label_ref (match_operand 3)) + (pc)))] + "TARGET_AVX" +{ + ix86_expand_branch (GET_CODE (operands[0]), + operands[1], operands[2], operands[3]); + DONE; +}) + (define_expand "cstore<mode>4" [(set (reg:CC FLAGS_REG) - (compare:CC (match_operand:SWIM 2 "nonimmediate_operand") - (match_operand:SWIM 3 "<general_operand>"))) + (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand") + (match_operand:SDWIM 3 "<general_operand>"))) (set (match_operand:QI 0 "register_operand") (match_operator 1 "ordered_comparison_operator" [(reg:CC FLAGS_REG) (const_int 0)]))] "" { - if (MEM_P (operands[2]) && MEM_P (operands[3])) + if (<MODE>mode == (TARGET_64BIT ? TImode : DImode)) + { + if (GET_CODE (operands[1]) != EQ + && GET_CODE (operands[1]) != NE) + FAIL; + } + else if (MEM_P (operands[2]) && MEM_P (operands[3])) operands[2] = force_reg (<MODE>mode, operands[2]); ix86_expand_setcc (operands[0], GET_CODE (operands[1]), operands[2], operands[3]); @@ -1483,6 +1506,52 @@ [(set_attr "type" "icmp") (set_attr "mode" "QI")]) +(define_insn_and_split "*cmp<dwi>_doubleword" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_operand:<DWI> 0 "nonimmediate_operand") + (match_operand:<DWI> 1 "x86_64_general_operand")))] + "ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (ior:DWIH (match_dup 4) (match_dup 5)) + (const_int 0))) + (set (match_dup 4) (ior:DWIH (match_dup 4) (match_dup 5)))])] +{ + split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[2]); + /* Placing the SUBREG pieces in pseudos helps reload. */ + for (int i = 0; i < 4; i++) + if (SUBREG_P (operands[i])) + operands[i] = force_reg (<MODE>mode, operands[i]); + + operands[4] = gen_reg_rtx (<MODE>mode); + if (operands[1] == const0_rtx) + emit_move_insn (operands[4], operands[0]); + else if (operands[0] == const0_rtx) + emit_move_insn (operands[4], operands[1]); + else if (operands[1] == constm1_rtx) + emit_insn (gen_one_cmpl<mode>2 (operands[4], operands[0])); + else if (operands[0] == constm1_rtx) + emit_insn (gen_one_cmpl<mode>2 (operands[4], operands[1])); + else + emit_insn (gen_xor<mode>3 (operands[4], operands[0], operands[1])); + + if (operands[3] == const0_rtx) + operands[5] = operands[2]; + else if (operands[2] == const0_rtx) + operands[5] = operands[3]; + else + { + operands[5] = gen_reg_rtx (<MODE>mode); + if (operands[3] == constm1_rtx) + emit_insn (gen_one_cmpl<mode>2 (operands[5], operands[2])); + else if (operands[2] == constm1_rtx) + emit_insn (gen_one_cmpl<mode>2 (operands[5], operands[3])); + else + emit_insn (gen_xor<mode>3 (operands[5], operands[2], operands[3])); + } +}) + ;; These implement float point compares. ;; %%% See if we can get away with VOIDmode operands on the actual insns, ;; which would allow mix and match FP modes on the compares. 
Which is what @@ -2182,9 +2251,9 @@ (define_insn "*movdi_internal" [(set (match_operand:DI 0 "nonimmediate_operand" - "=r ,o ,r,r ,r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,m,?r ,?*Yd,?r,?*v,?*y,?*x,*k,*k ,*r,*m,*k") + "=r ,o ,r,r ,r,m ,*y,*y,?*y,?m,?r,?*y,?v,?v,?v,m ,m,?r ,?*Yd,?r,?v,?*y,?*x,*k,*k ,*r,*m,*k") (match_operand:DI 1 "general_operand" - "riFo,riF,Z,rem,i,re,C ,*y,Bk ,*y,*y,r ,C ,*v,Bk,*v,v,*Yd,r ,*v,r ,*x ,*y ,*r,*kBk,*k,*k,CBC"))] + "riFo,riF,Z,rem,i,re,C ,*y,Bk ,*y,*y,r ,C ,?v,Bk,?v,v,*Yd,r ,?v,r ,*x ,*y ,*r,*kBk,*k,*k,CBC"))] "!(MEM_P (operands[0]) && MEM_P (operands[1])) && ix86_hardreg_mov_ok (operands[0], operands[1])" { @@ -2403,9 +2472,9 @@ (define_insn "*movsi_internal" [(set (match_operand:SI 0 "nonimmediate_operand" - "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k") + "=r,m ,*y,*y,?*y,?m,?r,?*y,?v,?v,?v,m ,?r,?v,*k,*k ,*rm,*k") (match_operand:SI 1 "general_operand" - "g ,re,C ,*y,Bk ,*y,*y,r ,C ,*v,Bk,*v,*v,r ,*r,*kBk,*k ,CBC"))] + "g ,re,C ,*y,Bk ,*y,*y,r ,C ,?v,Bk,?v,?v,r ,*r,*kBk,*k ,CBC"))] "!(MEM_P (operands[0]) && MEM_P (operands[1])) && ix86_hardreg_mov_ok (operands[0], operands[1])" { @@ -2658,7 +2727,7 @@ (const_string "TI")) (eq_attr "alternative" "12") (cond [(match_test "TARGET_AVX512FP16") - (const_string "HI") + (const_string "HF") (match_test "TARGET_AVX") (const_string "TI") (ior (not (match_test "TARGET_SSE2")) @@ -3455,9 +3524,7 @@ && !(MEM_P (operands[0]) && MEM_P (operands[1])) && (lra_in_progress || reload_completed || !CONST_DOUBLE_P (operands[1]) - || ((optimize_function_for_size_p (cfun) - || (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)) - && standard_sse_constant_p (operands[1], TFmode) == 1 + || (standard_sse_constant_p (operands[1], TFmode) == 1 && !memory_operand (operands[0], TFmode)) || (!TARGET_MEMORY_MISMATCH_STALL && memory_operand (operands[0], TFmode)))" @@ -3590,10 +3657,11 @@ || !CONST_DOUBLE_P (operands[1]) || ((optimize_function_for_size_p (cfun) || (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)) - && ((IS_STACK_MODE (DFmode) - && standard_80387_constant_p (operands[1]) > 0) - || (TARGET_SSE2 && TARGET_SSE_MATH - && standard_sse_constant_p (operands[1], DFmode) == 1)) + && IS_STACK_MODE (DFmode) + && standard_80387_constant_p (operands[1]) > 0 + && !memory_operand (operands[0], DFmode)) + || (TARGET_SSE2 && TARGET_SSE_MATH + && standard_sse_constant_p (operands[1], DFmode) == 1 && !memory_operand (operands[0], DFmode)) || ((TARGET_64BIT || !TARGET_MEMORY_MISMATCH_STALL) && memory_operand (operands[0], DFmode)) @@ -3762,10 +3830,10 @@ || !CONST_DOUBLE_P (operands[1]) || ((optimize_function_for_size_p (cfun) || (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)) - && ((IS_STACK_MODE (SFmode) - && standard_80387_constant_p (operands[1]) > 0) - || (TARGET_SSE && TARGET_SSE_MATH - && standard_sse_constant_p (operands[1], SFmode) == 1))) + && IS_STACK_MODE (SFmode) + && standard_80387_constant_p (operands[1]) > 0) + || (TARGET_SSE && TARGET_SSE_MATH + && standard_sse_constant_p (operands[1], SFmode) == 1) || memory_operand (operands[0], SFmode) || !TARGET_HARD_SF_REGS)" { @@ -6282,7 +6350,7 @@ [(set (reg FLAGS_REG) (compare (match_operand:SWI124 1 "nonimmediate_operand" "0") - (match_operand:SWI124 2 "const_int_operand" "n"))) + (match_operand:SWI124 2 "const_int_operand"))) (clobber (match_scratch:SWI124 0 "=<r>"))] "ix86_match_ccmode (insn, CCGCmode)" { @@ -6488,7 +6556,7 @@ (eq:CCO (plus:<DWI> (sign_extend:<DWI> (match_operand:SWI 1 "nonimmediate_operand" "0")) - (match_operand:<DWI> 3 
"const_int_operand" "i")) + (match_operand:<DWI> 3 "const_int_operand")) (sign_extend:<DWI> (plus:SWI (match_dup 1) @@ -6561,7 +6629,7 @@ (plus:<QPWI> (sign_extend:<QPWI> (match_operand:<DWI> 1 "nonimmediate_operand" "%0")) - (match_operand:<QPWI> 3 "const_scalar_int_operand" "")) + (match_operand:<QPWI> 3 "const_scalar_int_operand" "n")) (sign_extend:<QPWI> (plus:<DWI> (match_dup 1) @@ -6646,7 +6714,7 @@ [(match_operand 3 "flags_reg_operand") (const_int 0)]) (sign_extend:<DWI> (match_operand:SWI 1 "nonimmediate_operand" "%0"))) - (match_operand:<DWI> 6 "const_int_operand" "")) + (match_operand:<DWI> 6 "const_int_operand" "n")) (sign_extend:<DWI> (plus:SWI (plus:SWI @@ -6811,8 +6879,8 @@ (any_or:SWI12 (ashift:SWI12 (match_operand:SWI12 1 "index_register_operand" "l") - (match_operand 2 "const_0_to_3_operand" "n")) - (match_operand 3 "const_int_operand" "n")))] + (match_operand 2 "const_0_to_3_operand")) + (match_operand 3 "const_int_operand")))] "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) && ((unsigned HOST_WIDE_INT) INTVAL (operands[3]) < (HOST_WIDE_INT_1U << INTVAL (operands[2])))" @@ -6835,8 +6903,8 @@ (any_or:SWI48 (ashift:SWI48 (match_operand:SWI48 1 "index_register_operand" "l") - (match_operand 2 "const_0_to_3_operand" "n")) - (match_operand 3 "const_int_operand" "n")))] + (match_operand 2 "const_0_to_3_operand")) + (match_operand 3 "const_int_operand")))] "(unsigned HOST_WIDE_INT) INTVAL (operands[3]) < (HOST_WIDE_INT_1U << INTVAL (operands[2]))" "#" @@ -7032,7 +7100,7 @@ (eq:CCO (minus:<DWI> (sign_extend:<DWI> (match_operand:SWI 1 "nonimmediate_operand" "0")) - (match_operand:<DWI> 3 "const_int_operand" "i")) + (match_operand:<DWI> 3 "const_int_operand")) (sign_extend:<DWI> (minus:SWI (match_dup 1) @@ -7100,7 +7168,7 @@ (minus:<QPWI> (sign_extend:<QPWI> (match_operand:<DWI> 1 "nonimmediate_operand" "0")) - (match_operand:<QPWI> 3 "const_scalar_int_operand" "")) + (match_operand:<QPWI> 3 "const_scalar_int_operand")) (sign_extend:<QPWI> (minus:<DWI> (match_dup 1) @@ -7183,7 +7251,7 @@ (match_operand:SWI 1 "nonimmediate_operand" "%0")) (match_operator:<DWI> 4 "ix86_carry_flag_operator" [(match_operand 3 "flags_reg_operand") (const_int 0)])) - (match_operand:<DWI> 6 "const_int_operand" "")) + (match_operand:<DWI> 6 "const_int_operand" "n")) (sign_extend:<DWI> (minus:SWI (minus:SWI @@ -7453,7 +7521,7 @@ (match_operand:SWI48 1 "nonimmediate_operand" "%0")) (match_operand:SWI48 2 "x86_64_immediate_operand" "e"))) (plus:<DWI> - (match_operand:<DWI> 6 "const_scalar_int_operand" "") + (match_operand:<DWI> 6 "const_scalar_int_operand") (match_operator:<DWI> 4 "ix86_carry_flag_operator" [(match_dup 3) (const_int 0)])))) (set (match_operand:SWI48 0 "nonimmediate_operand" "=rm") @@ -8447,9 +8515,9 @@ (lshiftrt:<DWI> (mult:<DWI> (zero_extend:<DWI> (match_dup 2)) (zero_extend:<DWI> (match_dup 3))) - (match_operand:QI 4 "const_int_operand" "n"))))] + (match_operand:QI 4 "const_int_operand"))))] "TARGET_BMI2 && INTVAL (operands[4]) == <MODE_SIZE> * BITS_PER_UNIT - && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + && !(MEM_P (operands[2]) && MEM_P (operands[3]))" "mulx\t{%3, %0, %1|%1, %0, %3}" [(set_attr "type" "imulx") (set_attr "prefix" "vex") @@ -9081,7 +9149,7 @@ (define_insn_and_split "*udivmod<mode>4_pow2" [(set (match_operand:SWI48 0 "register_operand" "=r") (udiv:SWI48 (match_operand:SWI48 2 "register_operand" "0") - (match_operand:SWI48 3 "const_int_operand" "n"))) + (match_operand:SWI48 3 "const_int_operand"))) (set (match_operand:SWI48 1 "register_operand" "=r") 
(umod:SWI48 (match_dup 2) (match_dup 3))) (clobber (reg:CC FLAGS_REG))] @@ -9162,7 +9230,7 @@ [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI (udiv:SI (match_operand:SI 2 "register_operand" "0") - (match_operand:SI 3 "const_int_operand" "n")))) + (match_operand:SI 3 "const_int_operand")))) (set (match_operand:SI 1 "register_operand" "=r") (umod:SI (match_dup 2) (match_dup 3))) (clobber (reg:CC FLAGS_REG))] @@ -9246,7 +9314,7 @@ [(set (match_operand:DI 1 "register_operand" "=r") (zero_extend:DI (umod:SI (match_operand:SI 2 "register_operand" "0") - (match_operand:SI 3 "const_int_operand" "n")))) + (match_operand:SI 3 "const_int_operand")))) (set (match_operand:SI 0 "register_operand" "=r") (udiv:SI (match_dup 2) (match_dup 3))) (clobber (reg:CC FLAGS_REG))] @@ -9313,7 +9381,7 @@ ;; Avoid sign-extension (using cdq) for constant numerators. (define_insn_and_split "*divmodsi4_const" [(set (match_operand:SI 0 "register_operand" "=&a") - (div:SI (match_operand:SI 2 "const_int_operand" "n") + (div:SI (match_operand:SI 2 "const_int_operand") (match_operand:SI 3 "nonimmediate_operand" "rm"))) (set (match_operand:SI 1 "register_operand" "=&d") (mod:SI (match_dup 2) (match_dup 3))) @@ -9503,14 +9571,14 @@ [(set (reg FLAGS_REG) (compare (and:QI - (match_operand:QI 0 "nonimmediate_operand" "%qm,*a,qm,r") - (match_operand:QI 1 "nonmemory_operand" "q,n,n,n")) + (match_operand:QI 0 "nonimmediate_operand" "%qm,qm,r") + (match_operand:QI 1 "nonmemory_operand" "q,n,n")) (const_int 0)))] "ix86_match_ccmode (insn, CONST_INT_P (operands[1]) && INTVAL (operands[1]) >= 0 ? CCNOmode : CCZmode)" { - if (which_alternative == 3) + if (get_attr_mode (insn) == MODE_SI) { if (CONST_INT_P (operands[1]) && INTVAL (operands[1]) < 0) operands[1] = GEN_INT (INTVAL (operands[1]) & 0xff); @@ -9519,8 +9587,16 @@ return "test{b}\t{%1, %0|%0, %1}"; } [(set_attr "type" "test") - (set_attr "mode" "QI,QI,QI,SI") - (set_attr "pent_pair" "uv,uv,np,np")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "2") + (const_string "SI") + (and (match_test "optimize_insn_for_size_p ()") + (and (match_operand 0 "ext_QIreg_operand") + (match_operand 1 "const_0_to_127_operand"))) + (const_string "SI") + ] + (const_string "QI"))) + (set_attr "pent_pair" "uv,np,np")]) (define_insn "*test<mode>_1" [(set (reg FLAGS_REG) @@ -9590,8 +9666,8 @@ (match_operator 1 "compare_operator" [(zero_extract:SWI248 (match_operand 2 "int_nonimmediate_operand" "rm") - (match_operand 3 "const_int_operand" "n") - (match_operand 4 "const_int_operand" "n")) + (match_operand 3 "const_int_operand") + (match_operand 4 "const_int_operand")) (const_int 0)]))] "/* Ensure that resulting mask is zero or sign extended operand. */ INTVAL (operands[4]) >= 0 @@ -9692,6 +9768,27 @@ operands[2] = gen_rtx_AND (mode, val, immed_wide_int_const (mask, mode)); }) +;; Split and;cmp (as optimized by combine) into not;test +;; Except when TARGET_BMI provides andn (*andn_<mode>_ccno). +(define_insn_and_split "*test<mode>_not" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (and:SWI + (not:SWI (match_operand:SWI 0 "register_operand")) + (match_operand:SWI 1 "<nonmemory_szext_operand>")) + (const_int 0)))] + "ix86_pre_reload_split () + && (!TARGET_BMI || !REG_P (operands[1]))" + "#" + "&& 1" + [(set (match_dup 2) (not:SWI (match_dup 0))) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (and:SWI (match_dup 2) (match_dup 1)) + (const_int 0)))] +{ + operands[2] = gen_reg_rtx (<MODE>mode); +}) + ;; Convert HImode/SImode test instructions with immediate to QImode ones. 
;; i386 does not allow to encode test with 8bit sign extended immediate, so ;; this is relatively important trick. @@ -10111,7 +10208,7 @@ CONST_INT_P (operands[2]) && INTVAL (operands[2]) >= 0 ? CCNOmode : CCZmode)" { - if (which_alternative == 2) + if (get_attr_mode (insn) == MODE_SI) { if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) < 0) operands[2] = GEN_INT (INTVAL (operands[2]) & 0xff); @@ -10120,7 +10217,15 @@ return "and{b}\t{%2, %0|%0, %2}"; } [(set_attr "type" "alu") - (set_attr "mode" "QI,QI,SI") + (set (attr "mode") + (cond [(eq_attr "alternative" "2") + (const_string "SI") + (and (match_test "optimize_insn_for_size_p ()") + (and (match_operand 0 "ext_QIreg_operand") + (match_operand 2 "const_0_to_127_operand"))) + (const_string "SI") + ] + (const_string "QI"))) ;; Potential partial reg stall on alternative 2. (set (attr "preferred_for_speed") (cond [(eq_attr "alternative" "2") @@ -10386,6 +10491,40 @@ [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "direct, double") (set_attr "mode" "<MODE>")]) + +;; Split *andnsi_1 after reload with -Oz when not;and is shorter. +(define_split + [(set (match_operand:SI 0 "register_operand") + (and:SI (not:SI (match_operand:SI 1 "register_operand")) + (match_operand:SI 2 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && optimize_insn_for_size_p () && optimize_size > 1 + && REGNO (operands[0]) == REGNO (operands[1]) + && LEGACY_INT_REG_P (operands[0]) + && !REX_INT_REG_P (operands[2]) + && !reg_overlap_mentioned_p (operands[0], operands[2])" + [(set (match_dup 0) (not:SI (match_dup 1))) + (parallel [(set (match_dup 0) (and:SI (match_dup 0) (match_dup 2))) + (clobber (reg:CC FLAGS_REG))])]) + +;; Split *andn_si_ccno with -Oz when not;test is shorter. +(define_split + [(set (match_operand 0 "flags_reg_operand") + (match_operator 1 "compare_operator" + [(and:SI (not:SI (match_operand:SI 2 "general_reg_operand")) + (match_operand:SI 3 "nonimmediate_operand")) + (const_int 0)])) + (clobber (match_dup 2))] + "reload_completed + && optimize_insn_for_size_p () && optimize_size > 1 + && LEGACY_INT_REG_P (operands[2]) + && !REX_INT_REG_P (operands[3]) + && !reg_overlap_mentioned_p (operands[2], operands[3])" + [(set (match_dup 2) (not:SI (match_dup 2))) + (set (match_dup 0) (match_op_dup 1 + [(and:SI (match_dup 3) (match_dup 2)) + (const_int 0)]))]) ;; Logical inclusive and exclusive OR instructions @@ -10930,6 +11069,90 @@ (clobber (reg:CC FLAGS_REG))])] "split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[2]);") +;; Convert: +;; mov %esi, %edx +;; negl %eax +;; adcl $0, %edx +;; negl %edx +;; to: +;; xorl %edx, %edx +;; negl %eax +;; sbbl %esi, %edx + +(define_peephole2 + [(set (match_operand:SWI48 0 "general_reg_operand") + (match_operand:SWI48 1 "nonimmediate_gr_operand")) + (parallel + [(set (reg:CCC FLAGS_REG) + (ne:CCC (match_operand:SWI48 2 "general_reg_operand") (const_int 0))) + (set (match_dup 2) (neg:SWI48 (match_dup 2)))]) + (parallel + [(set (match_dup 0) + (plus:SWI48 (plus:SWI48 + (ltu:SWI48 (reg:CC FLAGS_REG) (const_int 0)) + (match_dup 0)) + (const_int 0))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 0) + (neg:SWI48 (match_dup 0))) + (clobber (reg:CC FLAGS_REG))])] + "REGNO (operands[0]) != REGNO (operands[2]) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[2], operands[1])" + [(parallel + [(set (reg:CCC FLAGS_REG) + (ne:CCC (match_dup 2) (const_int 0))) + (set (match_dup 2) (neg:SWI48 (match_dup 2)))]) + (parallel + 
[(set (match_dup 0) + (minus:SWI48 (minus:SWI48 + (match_dup 0) + (ltu:SWI48 (reg:CC FLAGS_REG) (const_int 0))) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);") + +;; Convert: +;; xorl %edx, %edx +;; negl %eax +;; adcl $0, %edx +;; negl %edx +;; to: +;; negl %eax +;; sbbl %edx, %edx // *x86_mov<mode>cc_0_m1 + +(define_peephole2 + [(parallel + [(set (match_operand:SWI48 0 "general_reg_operand") (const_int 0)) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (reg:CCC FLAGS_REG) + (ne:CCC (match_operand:SWI48 1 "general_reg_operand") (const_int 0))) + (set (match_dup 1) (neg:SWI48 (match_dup 1)))]) + (parallel + [(set (match_dup 0) + (plus:SWI48 (plus:SWI48 + (ltu:SWI48 (reg:CC FLAGS_REG) (const_int 0)) + (match_dup 0)) + (const_int 0))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 0) + (neg:SWI48 (match_dup 0))) + (clobber (reg:CC FLAGS_REG))])] + "REGNO (operands[0]) != REGNO (operands[1])" + [(parallel + [(set (reg:CCC FLAGS_REG) + (ne:CCC (match_dup 1) (const_int 0))) + (set (match_dup 1) (neg:SWI48 (match_dup 1)))]) + (parallel + [(set (match_dup 0) + (if_then_else:SWI48 (ltu:SWI48 (reg:CC FLAGS_REG) (const_int 0)) + (const_int -1) + (const_int 0))) + (clobber (reg:CC FLAGS_REG))])]) + (define_insn "*neg<mode>_1" [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m") (neg:SWI (match_operand:SWI 1 "nonimmediate_operand" "0"))) @@ -11018,6 +11241,14 @@ [(set_attr "type" "negnot") (set_attr "mode" "<MODE>")]) +(define_expand "x86_neg<mode>_ccc" + [(parallel + [(set (reg:CCC FLAGS_REG) + (ne:CCC (match_operand:SWI48 1 "register_operand") + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand") + (neg:SWI48 (match_dup 1)))])]) + (define_insn "*negqi_ext<mode>_2" [(set (zero_extract:SWI248 (match_operand:SWI248 0 "register_operand" "+Q") @@ -11659,11 +11890,16 @@ (ashift:<DWI> (match_operand:<DWI> 1 "register_operand") (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand" "c") - (match_operand:SI 3 "const_int_operand")) 0))) - (clobber (reg:CC FLAGS_REG))] - "(INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) == 0 + (and + (match_operand 2 "register_operand" "c") + (match_operand 3 "const_int_operand")) 0))) + (clobber (reg:CC FLAGS_REG))] + "((INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) == 0 + || ((INTVAL (operands[3]) & (2 * <MODE_SIZE> * BITS_PER_UNIT - 1)) + == (2 * <MODE_SIZE> * BITS_PER_UNIT - 1))) + && GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[2])), 2, + 4 << (TARGET_64BIT ? 
1 : 0)) && ix86_pre_reload_split ()" "#" "&& 1" @@ -11681,6 +11917,15 @@ (ashift:DWIH (match_dup 5) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { + if ((INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) != 0) + { + operands[2] = force_reg (GET_MODE (operands[2]), operands[2]); + operands[2] = gen_lowpart (QImode, operands[2]); + emit_insn (gen_ashl<dwi>3_doubleword (operands[0], operands[1], + operands[2])); + DONE; + } + split_double_mode (<DWI>mode, &operands[0], 2, &operands[4], &operands[6]); operands[8] = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT - 1); @@ -11689,11 +11934,16 @@ if ((INTVAL (operands[3]) & ((<MODE_SIZE> * BITS_PER_UNIT) - 1)) != ((<MODE_SIZE> * BITS_PER_UNIT) - 1)) { - rtx tem = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tem, operands[2], operands[3])); - operands[2] = tem; + rtx xops[3]; + xops[0] = gen_reg_rtx (GET_MODE (operands[2])); + xops[1] = operands[2]; + xops[2] = GEN_INT (INTVAL (operands[3]) + & ((<MODE_SIZE> * BITS_PER_UNIT) - 1)); + ix86_expand_binary_operator (AND, GET_MODE (operands[2]), xops); + operands[2] = xops[0]; } + operands[2] = force_reg (GET_MODE (operands[2]), operands[2]); operands[2] = gen_lowpart (QImode, operands[2]); if (!rtx_equal_p (operands[6], operands[7])) @@ -11708,7 +11958,9 @@ (match_operand:QI 2 "register_operand" "c") (match_operand:QI 3 "const_int_operand")))) (clobber (reg:CC FLAGS_REG))] - "(INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) == 0 + "((INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) == 0 + || ((INTVAL (operands[3]) & (2 * <MODE_SIZE> * BITS_PER_UNIT - 1)) + == (2 * <MODE_SIZE> * BITS_PER_UNIT - 1))) && ix86_pre_reload_split ()" "#" "&& 1" @@ -11726,6 +11978,13 @@ (ashift:DWIH (match_dup 5) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { + if ((INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) != 0) + { + emit_insn (gen_ashl<dwi>3_doubleword (operands[0], operands[1], + operands[2])); + DONE; + } + split_double_mode (<DWI>mode, &operands[0], 2, &operands[4], &operands[6]); operands[8] = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT - 1); @@ -11743,7 +12002,7 @@ emit_move_insn (operands[6], operands[7]); }) -(define_insn "*ashl<mode>3_doubleword" +(define_insn "ashl<mode>3_doubleword" [(set (match_operand:DWI 0 "register_operand" "=&r") (ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n") (match_operand:QI 2 "nonmemory_operand" "<S>c"))) @@ -11801,12 +12060,12 @@ (define_insn "*x86_64_shld_1" [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m") (ior:DI (ashift:DI (match_dup 0) - (match_operand:QI 2 "const_0_to_63_operand" "J")) + (match_operand:QI 2 "const_0_to_63_operand")) (subreg:DI (lshiftrt:TI (zero_extend:TI (match_operand:DI 1 "register_operand" "r")) - (match_operand:QI 3 "const_0_to_255_operand" "N")) 0))) + (match_operand:QI 3 "const_0_to_255_operand")) 0))) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT && INTVAL (operands[3]) == 64 - INTVAL (operands[2])" @@ -11865,12 +12124,12 @@ (define_insn "*x86_shld_1" [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m") (ior:SI (ashift:SI (match_dup 0) - (match_operand:QI 2 "const_0_to_31_operand" "I")) + (match_operand:QI 2 "const_0_to_31_operand")) (subreg:SI (lshiftrt:DI (zero_extend:DI (match_operand:SI 1 "register_operand" "r")) - (match_operand:QI 3 "const_0_to_63_operand" "J")) 0))) + (match_operand:QI 3 "const_0_to_63_operand")) 0))) (clobber (reg:CC FLAGS_REG))] "INTVAL (operands[3]) == 32 - INTVAL (operands[2])" "shld{l}\t{%2, %1, %0|%0, %1, %2}" @@ -11955,13 +12214,16 @@ (ashift:SWI48 (match_operand:SWI48 1 
"nonimmediate_operand") (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand" "c,r") - (match_operand:SI 3 "const_int_operand")) 0))) + (and + (match_operand 2 "register_operand" "c,r") + (match_operand 3 "const_int_operand")) 0))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands) && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == GET_MODE_BITSIZE (<MODE>mode)-1 + && GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[2])), 2, + 4 << (TARGET_64BIT ? 1 : 0)) && ix86_pre_reload_split ()" "#" "&& 1" @@ -11970,7 +12232,10 @@ (ashift:SWI48 (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] - "operands[2] = gen_lowpart (QImode, operands[2]);" +{ + operands[2] = force_reg (GET_MODE (operands[2]), operands[2]); + operands[2] = gen_lowpart (QImode, operands[2]); +} [(set_attr "isa" "*,bmi2")]) (define_insn_and_split "*ashl<mode>3_mask_1" @@ -12390,7 +12655,7 @@ [(set (reg FLAGS_REG) (compare (ashift:SI (match_operand:SI 1 "register_operand" "0") - (match_operand:QI 2 "const_1_to_31_operand" "I")) + (match_operand:QI 2 "const_1_to_31_operand")) (const_int 0))) (set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2))))] @@ -12543,13 +12808,16 @@ (any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand" "c,r") - (match_operand:SI 3 "const_int_operand")) 0))) + (and + (match_operand 2 "register_operand" "c,r") + (match_operand 3 "const_int_operand")) 0))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == GET_MODE_BITSIZE (<MODE>mode)-1 + && GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[2])), 2, + 4 << (TARGET_64BIT ? 1 : 0)) && ix86_pre_reload_split ()" "#" "&& 1" @@ -12558,7 +12826,10 @@ (any_shiftrt:SWI48 (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] - "operands[2] = gen_lowpart (QImode, operands[2]);" +{ + operands[2] = force_reg (GET_MODE (operands[2]), operands[2]); + operands[2] = gen_lowpart (QImode, operands[2]); +} [(set_attr "isa" "*,bmi2")]) (define_insn_and_split "*<insn><mode>3_mask_1" @@ -12588,11 +12859,16 @@ (any_shiftrt:<DWI> (match_operand:<DWI> 1 "register_operand") (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand" "c") - (match_operand:SI 3 "const_int_operand")) 0))) - (clobber (reg:CC FLAGS_REG))] - "(INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) == 0 + (and + (match_operand 2 "register_operand" "c") + (match_operand 3 "const_int_operand")) 0))) + (clobber (reg:CC FLAGS_REG))] + "((INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) == 0 + || ((INTVAL (operands[3]) & (2 * <MODE_SIZE> * BITS_PER_UNIT - 1)) + == (2 * <MODE_SIZE> * BITS_PER_UNIT - 1))) + && GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[2])), 2, + 4 << (TARGET_64BIT ? 
1 : 0)) && ix86_pre_reload_split ()" "#" "&& 1" @@ -12610,6 +12886,15 @@ (any_shiftrt:DWIH (match_dup 7) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { + if ((INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) != 0) + { + operands[2] = force_reg (GET_MODE (operands[2]), operands[2]); + operands[2] = gen_lowpart (QImode, operands[2]); + emit_insn (gen_<insn><dwi>3_doubleword (operands[0], operands[1], + operands[2])); + DONE; + } + split_double_mode (<DWI>mode, &operands[0], 2, &operands[4], &operands[6]); operands[8] = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT - 1); @@ -12618,11 +12903,16 @@ if ((INTVAL (operands[3]) & ((<MODE_SIZE> * BITS_PER_UNIT) - 1)) != ((<MODE_SIZE> * BITS_PER_UNIT) - 1)) { - rtx tem = gen_reg_rtx (SImode); - emit_insn (gen_andsi3 (tem, operands[2], operands[3])); - operands[2] = tem; + rtx xops[3]; + xops[0] = gen_reg_rtx (GET_MODE (operands[2])); + xops[1] = operands[2]; + xops[2] = GEN_INT (INTVAL (operands[3]) + & ((<MODE_SIZE> * BITS_PER_UNIT) - 1)); + ix86_expand_binary_operator (AND, GET_MODE (operands[2]), xops); + operands[2] = xops[0]; } + operands[2] = force_reg (GET_MODE (operands[2]), operands[2]); operands[2] = gen_lowpart (QImode, operands[2]); if (!rtx_equal_p (operands[4], operands[5])) @@ -12637,7 +12927,9 @@ (match_operand:QI 2 "register_operand" "c") (match_operand:QI 3 "const_int_operand")))) (clobber (reg:CC FLAGS_REG))] - "(INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) == 0 + "((INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) == 0 + || ((INTVAL (operands[3]) & (2 * <MODE_SIZE> * BITS_PER_UNIT - 1)) + == (2 * <MODE_SIZE> * BITS_PER_UNIT - 1))) && ix86_pre_reload_split ()" "#" "&& 1" @@ -12655,6 +12947,13 @@ (any_shiftrt:DWIH (match_dup 7) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { + if ((INTVAL (operands[3]) & (<MODE_SIZE> * BITS_PER_UNIT)) != 0) + { + emit_insn (gen_<insn><dwi>3_doubleword (operands[0], operands[1], + operands[2])); + DONE; + } + split_double_mode (<DWI>mode, &operands[0], 2, &operands[4], &operands[6]); operands[8] = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT - 1); @@ -12672,7 +12971,7 @@ emit_move_insn (operands[4], operands[5]); }) -(define_insn_and_split "*<insn><mode>3_doubleword" +(define_insn_and_split "<insn><mode>3_doubleword" [(set (match_operand:DWI 0 "register_operand" "=&r") (any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0") (match_operand:QI 2 "nonmemory_operand" "<S>c"))) @@ -12724,12 +13023,12 @@ (define_insn "*x86_64_shrd_1" [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m") (ior:DI (lshiftrt:DI (match_dup 0) - (match_operand:QI 2 "const_0_to_63_operand" "J")) + (match_operand:QI 2 "const_0_to_63_operand")) (subreg:DI (ashift:TI (zero_extend:TI (match_operand:DI 1 "register_operand" "r")) - (match_operand:QI 3 "const_0_to_255_operand" "N")) 0))) + (match_operand:QI 3 "const_0_to_255_operand")) 0))) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT && INTVAL (operands[3]) == 64 - INTVAL (operands[2])" @@ -12788,12 +13087,12 @@ (define_insn "*x86_shrd_1" [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m") (ior:SI (lshiftrt:SI (match_dup 0) - (match_operand:QI 2 "const_0_to_31_operand" "I")) + (match_operand:QI 2 "const_0_to_31_operand")) (subreg:SI (ashift:DI (zero_extend:DI (match_operand:SI 1 "register_operand" "r")) - (match_operand:QI 3 "const_0_to_63_operand" "J")) 0))) + (match_operand:QI 3 "const_0_to_63_operand")) 0))) (clobber (reg:CC FLAGS_REG))] "INTVAL (operands[3]) == 32 - INTVAL (operands[2])" "shrd{l}\t{%2, %1, %0|%0, %1, %2}" @@ -13201,7 +13500,7 @@ [(set (reg 
FLAGS_REG) (compare (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0") - (match_operand:QI 2 "const_1_to_31_operand" "I")) + (match_operand:QI 2 "const_1_to_31_operand")) (const_int 0))) (set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2))))] @@ -13355,13 +13654,16 @@ (any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand") (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand" "c") - (match_operand:SI 3 "const_int_operand")) 0))) + (and + (match_operand 2 "register_operand" "c") + (match_operand 3 "const_int_operand")) 0))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == GET_MODE_BITSIZE (<MODE>mode)-1 + && GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[2])), 2, + 4 << (TARGET_64BIT ? 1 : 0)) && ix86_pre_reload_split ()" "#" "&& 1" @@ -13370,18 +13672,24 @@ (any_rotate:SWI (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] - "operands[2] = gen_lowpart (QImode, operands[2]);") +{ + operands[2] = force_reg (GET_MODE (operands[2]), operands[2]); + operands[2] = gen_lowpart (QImode, operands[2]); +}) (define_split [(set (match_operand:SWI 0 "register_operand") (any_rotate:SWI (match_operand:SWI 1 "const_int_operand") (subreg:QI - (and:SI - (match_operand:SI 2 "register_operand") - (match_operand:SI 3 "const_int_operand")) 0)))] + (and + (match_operand 2 "register_operand") + (match_operand 3 "const_int_operand")) 0)))] "(INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) - == GET_MODE_BITSIZE (<MODE>mode) - 1" + == GET_MODE_BITSIZE (<MODE>mode) - 1 + && GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[2])), 2, + 4 << (TARGET_64BIT ? 1 : 0))" [(set (match_dup 4) (match_dup 1)) (set (match_dup 0) (any_rotate:SWI (match_dup 4) @@ -13578,7 +13886,7 @@ [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI (rotatert:SI (match_operand:SI 1 "nonimmediate_operand" "rm") - (match_operand:QI 2 "const_0_to_31_operand" "I"))))] + (match_operand:QI 2 "const_0_to_31_operand"))))] "TARGET_64BIT && TARGET_BMI2 && !optimize_function_for_size_p (cfun)" "rorx\t{%2, %1, %k0|%k0, %1, %2}" [(set_attr "type" "rotatex") @@ -13745,14 +14053,17 @@ (ashift:SWI48 (const_int 1) (subreg:QI - (and:SI - (match_operand:SI 1 "register_operand") - (match_operand:SI 2 "const_int_operand")) 0)) + (and + (match_operand 1 "register_operand") + (match_operand 2 "const_int_operand")) 0)) (match_operand:SWI48 3 "register_operand"))) (clobber (reg:CC FLAGS_REG))] "TARGET_USE_BT && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == GET_MODE_BITSIZE (<MODE>mode)-1 + && GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[1])), 2, + 4 << (TARGET_64BIT ? 
1 : 0)) && ix86_pre_reload_split ()" "#" "&& 1" @@ -13763,7 +14074,10 @@ (match_dup 1)) (match_dup 3))) (clobber (reg:CC FLAGS_REG))])] - "operands[1] = gen_lowpart (QImode, operands[1]);") +{ + operands[1] = force_reg (GET_MODE (operands[1]), operands[1]); + operands[1] = gen_lowpart (QImode, operands[1]); +}) (define_insn_and_split "*<btsc><mode>_mask_1" [(set (match_operand:SWI48 0 "register_operand") @@ -13810,14 +14124,17 @@ (rotate:SWI48 (const_int -2) (subreg:QI - (and:SI - (match_operand:SI 1 "register_operand") - (match_operand:SI 2 "const_int_operand")) 0)) + (and + (match_operand 1 "register_operand") + (match_operand 2 "const_int_operand")) 0)) (match_operand:SWI48 3 "register_operand"))) (clobber (reg:CC FLAGS_REG))] "TARGET_USE_BT && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == GET_MODE_BITSIZE (<MODE>mode)-1 + && GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[1])), 2, + 4 << (TARGET_64BIT ? 1 : 0)) && ix86_pre_reload_split ()" "#" "&& 1" @@ -13828,7 +14145,10 @@ (match_dup 1)) (match_dup 3))) (clobber (reg:CC FLAGS_REG))])] - "operands[1] = gen_lowpart (QImode, operands[1]);") +{ + operands[1] = force_reg (GET_MODE (operands[1]), operands[1]); + operands[1] = gen_lowpart (QImode, operands[1]); +}) (define_insn_and_split "*btr<mode>_mask_1" [(set (match_operand:SWI48 0 "register_operand") @@ -13931,7 +14251,7 @@ (define_insn "*btsq_imm" [(set (zero_extract:DI (match_operand:DI 0 "nonimmediate_operand" "+rm") (const_int 1) - (match_operand 1 "const_0_to_63_operand" "J")) + (match_operand 1 "const_0_to_63_operand")) (const_int 1)) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT && (TARGET_USE_BT || reload_completed)" @@ -13944,7 +14264,7 @@ (define_insn "*btrq_imm" [(set (zero_extract:DI (match_operand:DI 0 "nonimmediate_operand" "+rm") (const_int 1) - (match_operand 1 "const_0_to_63_operand" "J")) + (match_operand 1 "const_0_to_63_operand")) (const_int 0)) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT && (TARGET_USE_BT || reload_completed)" @@ -13957,7 +14277,7 @@ (define_insn "*btcq_imm" [(set (zero_extract:DI (match_operand:DI 0 "nonimmediate_operand" "+rm") (const_int 1) - (match_operand 1 "const_0_to_63_operand" "J")) + (match_operand 1 "const_0_to_63_operand")) (not:DI (zero_extract:DI (match_dup 0) (const_int 1) (match_dup 1)))) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT && (TARGET_USE_BT || reload_completed)" @@ -14179,16 +14499,57 @@ PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0]))); }) +(define_insn_and_split "*jcc_bt<mode>_mask_1" + [(set (pc) + (if_then_else (match_operator 0 "bt_comparison_operator" + [(zero_extract:SWI48 + (match_operand:SWI48 1 "register_operand") + (const_int 1) + (zero_extend:SI + (subreg:QI + (and + (match_operand 2 "register_operand") + (match_operand 3 "const_int_operand")) 0)))]) + (label_ref (match_operand 4)) + (pc))) + (clobber (reg:CC FLAGS_REG))] + "(TARGET_USE_BT || optimize_function_for_size_p (cfun)) + && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) + == GET_MODE_BITSIZE (<MODE>mode)-1 + && GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT + && IN_RANGE (GET_MODE_SIZE (GET_MODE (operands[2])), 2, + 4 << (TARGET_64BIT ? 
1 : 0)) + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (zero_extract:SWI48 + (match_dup 1) + (const_int 1) + (match_dup 2)) + (const_int 0))) + (set (pc) + (if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)]) + (label_ref (match_dup 4)) + (pc)))] +{ + operands[2] = force_reg (GET_MODE (operands[2]), operands[2]); + operands[2] = gen_lowpart (SImode, operands[2]); + operands[0] = shallow_copy_rtx (operands[0]); + PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0]))); +}) + ;; Help combine recognize bt followed by cmov (define_split [(set (match_operand:SWI248 0 "register_operand") (if_then_else:SWI248 - (ne - (zero_extract:SWI48 - (match_operand:SWI48 1 "register_operand") - (const_int 1) - (zero_extend:SI (match_operand:QI 2 "register_operand"))) - (const_int 0)) + (match_operator 5 "bt_comparison_operator" + [(zero_extract:SWI48 + (match_operand:SWI48 1 "register_operand") + (const_int 1) + (zero_extend:SI (match_operand:QI 2 "register_operand"))) + (const_int 0)]) (match_operand:SWI248 3 "nonimmediate_operand") (match_operand:SWI248 4 "nonimmediate_operand")))] "TARGET_USE_BT && TARGET_CMOVE @@ -14203,6 +14564,8 @@ (match_dup 3) (match_dup 4)))] { + if (GET_CODE (operands[5]) == EQ) + std::swap (operands[3], operands[4]); operands[2] = lowpart_subreg (SImode, operands[2], QImode); }) @@ -14483,7 +14846,7 @@ (unspec:QI [(match_operand:HF 1 "register_operand" "v") (match_operand:HF 2 "nonimmediate_operand" "vm") - (match_operand:SI 3 "const_0_to_31_operand" "n")] + (match_operand:SI 3 "const_0_to_31_operand")] UNSPEC_PCMP))] "TARGET_AVX512FP16" "vcmpsh\t{%3, %2, %1, %0|%0, %1, %2, %3}" @@ -16500,7 +16863,7 @@ (zero_extract:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm") (umin:SWI48 (and:SWI48 (match_dup 2) (const_int 255)) - (match_operand:SWI48 3 "const_int_operand" "n")) + (match_operand:SWI48 3 "const_int_operand")) (const_int 0)) (const_int 0))) (clobber (reg:CC FLAGS_REG))] @@ -16517,7 +16880,7 @@ (zero_extract:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm") (umin:SWI48 (zero_extend:SWI48 (match_dup 2)) - (match_operand:SWI48 3 "const_int_operand" "n")) + (match_operand:SWI48 3 "const_int_operand")) (const_int 0)) (const_int 0))) (clobber (reg:CC FLAGS_REG))] @@ -16535,7 +16898,7 @@ (zero_extract:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm") (umin:SWI48 (zero_extend:SWI48 (match_dup 2)) - (match_operand:SWI48 3 "const_int_operand" "n")) + (match_operand:SWI48 3 "const_int_operand")) (const_int 0)) (const_int 0)) (const_int 0))) @@ -16575,6 +16938,22 @@ (set_attr "prefix" "vex") (set_attr "mode" "<MODE>")]) +(define_insn "*bmi2_bzhi_zero_extendsidi_4" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (and:SI + (plus:SI + (ashift:SI (const_int 1) + (match_operand:QI 2 "register_operand" "r")) + (const_int -1)) + (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_BMI2" + "bzhi\t{%q2, %q1, %q0|%q0, %q1, %q2}" + [(set_attr "type" "bitmanip") + (set_attr "prefix" "vex") + (set_attr "mode" "DI")]) + (define_insn "bmi2_pdep_<mode>3" [(set (match_operand:SWI48 0 "register_operand" "=r") (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r") @@ -16602,8 +16981,8 @@ [(set (match_operand:SWI48 0 "register_operand" "=r") (zero_extract:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm") - (match_operand 2 "const_0_to_255_operand" "N") - (match_operand 3 "const_0_to_255_operand" "N"))) + (match_operand 2 
"const_0_to_255_operand") + (match_operand 3 "const_0_to_255_operand"))) (clobber (reg:CC FLAGS_REG))] "TARGET_TBM" { @@ -19226,7 +19605,7 @@ [(set (match_operand:MODEFH 0 "register_operand" "=x,x,x,v,v") (unspec:MODEFH [(match_operand:MODEFH 1 "nonimmediate_operand" "0,x,m,v,m") - (match_operand:SI 2 "const_0_to_15_operand" "n,n,n,n,n")] + (match_operand:SI 2 "const_0_to_15_operand")] UNSPEC_ROUND))] "TARGET_SSE4_1" "@ @@ -20511,6 +20890,12 @@ (set_attr "mode" "<MODE>") (set_attr "length_immediate" "0")]) +(define_expand "x86_mov<mode>cc_0_m1_neg" + [(parallel + [(set (match_operand:SWI48 0 "register_operand") + (neg:SWI48 (ltu:SWI48 (reg:CCC FLAGS_REG) (const_int 0)))) + (clobber (reg:CC FLAGS_REG))])]) + (define_split [(set (match_operand:SWI48 0 "register_operand") (neg:SWI48 @@ -21418,7 +21803,7 @@ (unspec_volatile:P [(match_operand:P 1 "register_operand" "0")] UNSPECV_PROBE_STACK_RANGE)) (set (reg:P SP_REG) - (minus:P (reg:P SP_REG) (match_operand:P 2 "const_int_operand" "n"))) + (minus:P (reg:P SP_REG) (match_operand:P 2 "const_int_operand"))) (clobber (reg:CC FLAGS_REG)) (clobber (mem:BLK (scratch)))] "" @@ -21428,7 +21813,7 @@ (define_insn "@probe_stack_range_<mode>" [(set (match_operand:P 0 "register_operand" "=r") (unspec_volatile:P [(match_operand:P 1 "register_operand" "0") - (match_operand:P 2 "const_int_operand" "n")] + (match_operand:P 2 "const_int_operand")] UNSPECV_PROBE_STACK_RANGE)) (clobber (reg:CC FLAGS_REG))] "" @@ -22872,7 +23257,7 @@ (define_insn "*prefetch_3dnow" [(prefetch (match_operand 0 "address_operand" "p") - (match_operand:SI 1 "const_int_operand" "n") + (match_operand:SI 1 "const_int_operand") (const_int 3))] "TARGET_3DNOW || TARGET_PRFCHW || TARGET_PREFETCHWT1" { @@ -23422,7 +23807,7 @@ (define_insn "@lwp_lwpval<mode>" [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r") (match_operand:SI 1 "nonimmediate_operand" "rm") - (match_operand:SI 2 "const_int_operand" "i")] + (match_operand:SI 2 "const_int_operand")] UNSPECV_LWPVAL_INTRINSIC)] "TARGET_LWP" "lwpval\t{%2, %1, %0|%0, %1, %2}" @@ -23435,7 +23820,7 @@ [(set (reg:CCC FLAGS_REG) (unspec_volatile:CCC [(match_operand:SWI48 0 "register_operand" "r") (match_operand:SI 1 "nonimmediate_operand" "rm") - (match_operand:SI 2 "const_int_operand" "i")] + (match_operand:SI 2 "const_int_operand")] UNSPECV_LWPINS_INTRINSIC))] "TARGET_LWP" "lwpins\t{%2, %1, %0|%0, %1, %2}" @@ -23641,7 +24026,7 @@ (set_attr "length" "3")]) (define_insn "xabort" - [(unspec_volatile [(match_operand:SI 0 "const_0_to_255_operand" "n")] + [(unspec_volatile [(match_operand:SI 0 "const_0_to_255_operand")] UNSPECV_XABORT)] "TARGET_RTM" "xabort\t%0" diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index d8e8656..0dbaacb 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1047,7 +1047,7 @@ Enable shadow stack built-in functions from Control-flow Enforcement Technology (CET). mcet-switch -Target Undocumented Var(flag_cet_switch) Init(0) +Target Var(flag_cet_switch) Init(0) Turn on CET instrumentation for switch statements that use a jump table and an indirect jump. @@ -1210,3 +1210,7 @@ Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX5 mdirect-extern-access Target Var(ix86_direct_extern_access) Init(1) Do not use GOT to access external symbols. + +-param=x86-stlf-window-ninsns= +Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param +Instructions number above which STFL stall penalty can be compensated. 
diff --git a/gcc/config/i386/lujiazui.md b/gcc/config/i386/lujiazui.md new file mode 100644 index 0000000..9046c09 --- /dev/null +++ b/gcc/config/i386/lujiazui.md @@ -0,0 +1,844 @@ +;; Copyright (C) 2012-2022 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. +;; + +;; Scheduling for ZHAOXIN lujiazui processor. + +;; Modeling automatons for decoders, execution pipes and AGU pipes. +(define_automaton "lujiazui_decoder,lujiazui_core,lujiazui_agu") + +;; The rules for the decoder are simple: +;; - an instruction with 1 uop can be decoded by any of the three +;; decoders in one cycle. +;; - an instruction with 2 uops can be decoded by decoder 0 or decoder 1 +;; but still in only one cycle. +;; - a complex (microcode) instruction can only be decoded by +;; decoder 0, and this takes an unspecified number of cycles. +;; +;; The goal is to schedule such that we have a few-one-two uops sequence +;; in each cycle, to decode as many instructions per cycle as possible. +(define_cpu_unit "lua_decoder0" "lujiazui_decoder") +(define_cpu_unit "lua_decoder1" "lujiazui_decoder") +(define_cpu_unit "lua_decoder2" "lujiazui_decoder") + +;; We first wish to find an instruction for lua_decoder0, so exclude +;; lua_decoder1 and lua_decoder2 from being reserved until +;; lua_decoder0 is reserved, and also exclude lua_decoder2 +;; from being reserved until lua_decoder1 is reserved. +(presence_set "lua_decoder1" "lua_decoder0") +(presence_set "lua_decoder2" "lua_decoder0") +(presence_set "lua_decoder2" "lua_decoder1") + +;; Most instructions can be decoded on any of the three decoders. +(define_reservation "lua_decodern" "lua_decoder0|lua_decoder1|lua_decoder2") +(define_reservation "lua_decoder01" "lua_decoder0|lua_decoder1") + +;; The out-of-order core has six pipelines. +;; Port 4, 5 are responsible for address calculations, load or store. +;; Port 0, 1, 2, 3 for everything else. + +(define_cpu_unit "lua_p0,lua_p1,lua_p2,lua_p3" "lujiazui_core") +(define_cpu_unit "lua_p4,lua_p5" "lujiazui_agu") + +(define_reservation "lua_p03" "lua_p0|lua_p3") +(define_reservation "lua_p12" "lua_p1|lua_p2") +(define_reservation "lua_p1p2" "lua_p1+lua_p2") +(define_reservation "lua_p45" "lua_p4|lua_p5") +(define_reservation "lua_p4p5" "lua_p4+lua_p5") +(define_reservation "lua_p0p1p2p3" "lua_p0+lua_p1+lua_p2+lua_p3") + +;; Only the irregular instructions have to be modeled here. + +;; Complex instruction. +(define_insn_reservation "lua_complex_insn" 6 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "other,multi,str")) + "lua_decoder0") + +;; Call instruction. +(define_insn_reservation "lua_call" 1 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "call,callv")) + "lua_decoder0,lua_p45,lua_p1") + +;; MOV - integer moves. 
+(define_insn_reservation "lua_imov" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "imov,imovx"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_imov_load" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "imov,imovx"))) + "lua_decoder01,lua_p45") + +(define_insn_reservation "lua_imov_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "imov"))) + "lua_decodern,lua_p12+lua_p45") + +(define_insn_reservation "lua_icmov" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "icmov"))) + "lua_decodern,lua_p2") + +(define_insn_reservation "lua_icmov_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "icmov"))) + "lua_decoder01,lua_p45,lua_p2") + +;; Push and pop. +(define_insn_reservation "lua_push_reg" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "push"))) + "lua_decodern,lua_p12+lua_p45") + +(define_insn_reservation "lua_push_mem" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "push"))) + "lua_decoder01,lua_p45,lua_p12+lua_p45") + +(define_insn_reservation "lua_pop_reg" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "pop"))) + "lua_decoder01,lua_p45") + +(define_insn_reservation "lua_pop_mem" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "pop"))) + "lua_decoder0,lua_p45,lua_p12+lua_p45") + +(define_insn_reservation "lua_lea" 1 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "lea")) + "hsw_decodern,lua_p45") + +(define_insn_reservation "lua_shift_rotate" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "lua_decodern,lua_p2") + +(define_insn_reservation "lua_shift_rotate_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "lua_decoder01,lua_p45,lua_p2") + +(define_insn_reservation "lua_shift_rotate_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "lua_decoder01,lua_p2,lua_p45") + +(define_insn_reservation "lua_shift_rotate_both" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "lua_decoder0,lua_p45,lua_p2,lua_p45") + +(define_insn_reservation "lua_branch" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "ibr"))) + "lua_decodern,lua_p1") + +(define_insn_reservation "lua_indirect_branch_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ibr"))) + "lua_decodern,lua_p45,lua_p1") + +(define_insn_reservation "lua_leave" 4 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "leave")) + "lua_decoder0,lua_p45+lua_p12,lua_p12") + +;; Multiplication instructions. 
+ +(define_insn_reservation "lua_imul_qi" 2 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "imul,imulx")))) + "lua_decodern,lua_p1p2") + +(define_insn_reservation "lua_imul_qi_mem" 6 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder01,lua_p1p2+lua_p45") + +(define_insn_reservation "lua_imul_hisi" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI,SI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder0,lua_p1p2") + +(define_insn_reservation "lua_imul_hisi_mem" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "HI,SI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder0,lua_p1p2+lua_p45") + +(define_insn_reservation "lua_imul_di" 12 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder0,lua_p0p1p2p3") + +(define_insn_reservation "lua_imul_di_mem" 16 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "imul,imulx")))) + "lua_decoder0,lua_p0p1p2p3+lua_p45") + +;; Division instructions. + +(define_insn_reservation "lua_idiv_qi" 21 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p0p1p2p3*21") + +(define_insn_reservation "lua_idiv_qi_load" 25 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p45,lua_p0p1p2p3*21") + +(define_insn_reservation "lua_idiv_hi" 22 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p0p1p2p3*22") + +(define_insn_reservation "lua_idiv_hi_load" 26 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p45,lua_p0p1p2p3*22") + +(define_insn_reservation "lua_idiv_si" 20 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p0p1p2p3*20") + +(define_insn_reservation "lua_idiv_si_load" 24 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p45,lua_p0p1p2p3*20") + +(define_insn_reservation "lua_idiv_di" 150 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p0p1p2p3*150") + +(define_insn_reservation "lua_idiv_di_load" 154 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DI") + (eq_attr "type" "idiv")))) + "lua_decoder0,lua_p45,lua_p0p1p2p3*150") + +;; x87 floating point operations. 
+ +(define_insn_reservation "lua_fxch" 1 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "fxch")) + "lua_decodern,lua_p1") + +(define_insn_reservation "lua_fop" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "fop"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fop_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "fop"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_fop_store" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "fop"))) + "lua_decodern,lua_p0,lua_p45") + +(define_insn_reservation "lua_fop_both" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "fop"))) + "lua_decoder0,lua_p45,lua_p0,lua_p45") + +(define_insn_reservation "lua_fsgn" 1 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "fsgn")) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fistp" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "fistp"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fistp_mem" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (eq_attr "type" "fistp"))) + "lua_decoder0,lua_p0+lua_p45") + +(define_insn_reservation "lua_fcmov" 3 + (and (eq_attr "cpu" "lujiazui") + (eq_attr "type" "fcmov")) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fcmp" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "fcmp"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fcmp_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "fcmp"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_fmov" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmov"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_fmov_load" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_fmov_XF_load" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "lua_decoder0,lua_p45,lua_p0") + +(define_insn_reservation "lua_fmov_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "lua_decoder0,lua_p0,lua_p45") + +(define_insn_reservation "lua_fmov_XF_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "lua_decoder0,lua_p0,lua_p45") + +(define_insn_reservation "lua_fmul" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmul"))) + "lua_decodern,lua_p3") + +(define_insn_reservation "lua_fmul_load" 8 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "fp_int_src" "false") + (and (eq_attr "memory" "load") + (eq_attr "type" "fmul")))) + "lua_decoder01,lua_p45,lua_p3") + +(define_insn_reservation "lua_fimul_load" 8 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "fp_int_src" "true") + (and (eq_attr "memory" "load") + (eq_attr "type" "fmul")))) + "lua_decoder0,lua_p45,lua_p3") + +;; fdiv instructions. 
+ +(define_insn_reservation "lua_fdiv_SF" 15 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decodern,lua_p0*15") + +(define_insn_reservation "lua_fdiv_SF_load" 19 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decoder01,lua_p45,lua_p0*15") + +(define_insn_reservation "lua_fdiv_DF" 18 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decodern,lua_p0*18") + +(define_insn_reservation "lua_fdiv_DF_load" 22 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decoder01,lua_p45,lua_p0*18") + +(define_insn_reservation "lua_fdiv_XF" 22 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decoder0,lua_p0*22") + +(define_insn_reservation "lua_fdiv_XF_load" 26 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "lua_decoder0,lua_p45,lua_p0*22") + +;; MMX instructions. + +(define_insn_reservation "lua_mmx_sse_add_shft" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxadd,sseiadd,mmxshft,sseishft"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_mmx_sse_add_shft_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxadd,sseiadd,mmxshft,sseishft"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_mmx_sse_add_shft_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "mmxadd,sseiadd,mmxshft,sseishft"))) + "lua_decodern,lua_p0,lua_p45") + +(define_insn_reservation "lua_mmx_mul" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul,sseimul"))) + "lua_decodern,lua_p3") + +(define_insn_reservation "lua_mmx_mul_load" 9 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxmul,sseimul"))) + "lua_decoder01,lua_p45,lua_p3") + +(define_insn_reservation "lua_mmxcvt" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxcvt"))) + "lua_decodern,lua_p03") + +(define_insn_reservation "lua_mmxcvt_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxcvt"))) + "lua_decoder01,lua_p45,lua_p03") + +;; The sfence instruction. 
+(define_insn_reservation "lua_sse_sfence" 13 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "unknown") + (eq_attr "type" "sse"))) + "lua_decoder0,lua_p45") + +(define_insn_reservation "lua_sse_SFDF" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "mode" "SF,DF") + (eq_attr "type" "sse"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_V4SF" 13 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sse"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_V8SF" 19 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "mode" "V8SF,V4DF") + (eq_attr "type" "sse"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_add1" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "sseadd1"))) + "lua_decoder0,lua_p0") + +(define_insn_reservation "lua_sse_add1_load" 8 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "sseadd1"))) + "lua_decoder0,lua_p45,lua_p0") + +(define_insn_reservation "lua_sse_cmp" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "ssecmp,ssecomi"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_cmp_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ssecmp,ssecomi"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_sse_logic" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "sselog,sselog1"))) + "lua_decodern,lua_p03") + +(define_insn_reservation "lua_sse_logic_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "sselog,sselog1"))) + "lua_decoder01,lua_p45,lua_p03") + +(define_insn_reservation "lua_sse_add" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "sseadd"))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sse_add_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "sseadd"))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_ssemul_ss_ps" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF,V4SF,V8SF") + (eq_attr "type" "ssemul")))) + "lua_decodern,lua_p3") + +(define_insn_reservation "lua_ssemul_ss_ps_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF,V4SF,V8SF") + (eq_attr "type" "ssemul")))) + "lua_decoder01,lua_p45,lua_p3") + +(define_insn_reservation "lua_ssemul_sd_pd" 4 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF,V2DF,V4DF") + (eq_attr "type" "ssemul")))) + "lua_decodern,lua_p3") + +(define_insn_reservation "lua_ssemul_sd_pd_load" 8 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF,V2DF,V4DF") + (eq_attr "type" "ssemul")))) + "lua_decoder01,lua_p45,lua_p3") + +(define_insn_reservation "lua_ssediv_SF" 13 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssediv")))) + "lua_decodern,lua_p0*13") + +(define_insn_reservation "lua_ssediv_load_SF" 17 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "ssediv")))) + "lua_decoder01,lua_p45,lua_p0*13") + +(define_insn_reservation "lua_ssediv_V4SF" 23 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" 
"ssediv")))) + "lua_decodern,lua_p0*23") + +(define_insn_reservation "lua_ssediv_load_V4SF" 27 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "ssediv")))) + "lua_decoder01,lua_p45,lua_p0*23") + +(define_insn_reservation "lua_ssediv_V8SF" 47 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V8SF") + (eq_attr "type" "ssediv")))) + "lua_decoder0,lua_p0*47") + +(define_insn_reservation "lua_ssediv_load_V8SF" 51 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V8SF") + (eq_attr "type" "ssediv")))) + "lua_decoder0,lua_p45,lua_p0*47") + +(define_insn_reservation "lua_ssediv_SD" 17 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "ssediv")))) + "lua_decodern,lua_p0*17") + +(define_insn_reservation "lua_ssediv_load_SD" 21 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF") + (eq_attr "type" "ssediv")))) + "lua_decoder01,lua_p45,lua_p0*17") + +(define_insn_reservation "lua_ssediv_V2DF" 30 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V2DF") + (eq_attr "type" "ssediv")))) + "lua_decodern,lua_p0*30") + +(define_insn_reservation "lua_ssediv_load_V2DF" 34 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V2DF") + (eq_attr "type" "ssediv")))) + "lua_decoder01,lua_p45,lua_p0*30") + +(define_insn_reservation "lua_ssediv_V4DF" 56 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "V4DF") + (eq_attr "type" "ssediv")))) + "lua_decoder0,lua_p0*56") + +(define_insn_reservation "lua_ssediv_load_V4DF" 60 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "V4DF") + (eq_attr "type" "ssediv")))) + "lua_decoder0,lua_p4p5,lua_p0*56") + + +(define_insn_reservation "lua_sseicvt_si" 2 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (and (match_operand:SF 1 "memory_operand") + (eq_attr "type" "sseicvt"))))) + "lua_decoder01,lua_p0") + +(define_insn_reservation "lua_sseicvt_si_load" 6 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (and (match_operand:SF 1 "memory_operand") + (eq_attr "type" "sseicvt"))))) + "lua_decoder0,lua_p45,lua_p0") + +(define_insn_reservation "lua_sseicvtdf_si" 3 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (and (match_operand:DF 1 "memory_operand") + (eq_attr "type" "sseicvt"))))) + "lua_decodern,lua_p0") + +(define_insn_reservation "lua_sseicvtdf_si_load" 7 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (and (match_operand:DF 1 "memory_operand") + (eq_attr "type" "sseicvt"))))) + "lua_decoder01,lua_p45,lua_p0") + +(define_insn_reservation "lua_ssecvt" 6 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "ssecvt"))) + "lua_decoder01,lua_p03") + +(define_insn_reservation "lua_ssecvt_load" 10 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ssecvt"))) + "lua_decoder0,lua_p45,lua_p03") + +(define_insn_reservation "lua_sse_mov" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "ssemov"))) + "lua_decodern,lua_p03") + +(define_insn_reservation "lua_sse_mov_load" 5 + (and (eq_attr 
"cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "ssemov"))) + "lua_decoder01,lua_p45,lua_p03") + +(define_insn_reservation "lua_sse_mov_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "ssemov"))) + "lua_decoder01,lua_p0,lua_p45") + +(define_insn_reservation "lua_insn_alu" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "alu"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_insn_alu_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "alu"))) + "lua_decoder01,lua_p45,lua_p12") + +(define_insn_reservation "lua_insn_alu_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "alu"))) + "lua_decoder01,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_alu_both" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "alu"))) + "lua_decoder0,lua_p45,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_alu1" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "alu1"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_insn_alu1_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "alu1"))) + "lua_decoder01,lua_p45,lua_p12") + +(define_insn_reservation "lua_insn_alu1_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "alu1"))) + "lua_decoder01,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_alu1_both" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "alu1"))) + "lua_decoder0,lua_p45,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_negnot_incdec" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "negnot,incdec"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_insn_negnot_setcc" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "setcc"))) + "lua_decodern,lua_p2") + +(define_insn_reservation "lua_insn_negnot_setcc_mem" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (eq_attr "type" "negnot,setcc"))) + "lua_decoder01,lua_p45,lua_p2,lua_p45") + +(define_insn_reservation "lua_insn_incdec_mem" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "!none") + (eq_attr "type" "incdec"))) + "lua_decoder0,lua_p45,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_icmptest" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "icmp,test"))) + "lua_decodern,lua_p12") + +(define_insn_reservation "lua_insn_icmptest_load" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "load") + (eq_attr "type" "icmp,test"))) + "lua_decoder01,lua_p45,lua_p12") + +(define_insn_reservation "lua_insn_icmptest_store" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "store") + (eq_attr "type" "icmp,test"))) + "lua_decoder01,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_icmptest_both" 5 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "both") + (eq_attr "type" "icmp,test"))) + "lua_decoder0,lua_p45,lua_p12,lua_p45") + +(define_insn_reservation "lua_insn_sseishft1_mmx" 1 + (and (eq_attr "cpu" "lujiazui") + (and (eq_attr "memory" "none") + (eq_attr "type" "sseishft1,mmx,mmxcmp"))) + "lua_decodern,lua_p03") + +(define_insn_reservation "lua_insn_sseishft1_mmx_mem" 5 + (and (eq_attr "cpu" "lujiazui") + (and 
(eq_attr "memory" "load") + (eq_attr "type" "sseishft1,mmx,mmxcmp"))) + "lua_decoder01,lua_p45,lua_p03") diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 29d470b..ba53007a 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -462,7 +462,7 @@ (const_string "TI")) (eq_attr "alternative" "5") (cond [(match_test "TARGET_AVX512FP16") - (const_string "HI") + (const_string "HF") (match_test "TARGET_AVX") (const_string "TI") (ior (not (match_test "TARGET_SSE2")) @@ -3454,7 +3454,7 @@ [(set (match_operand:HI 0 "register_sse4nonimm_operand" "=r,r,m") (vec_select:HI (match_operand:V4HI 1 "register_operand" "y,YW,YW") - (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n,n")])))] + (parallel [(match_operand:SI 2 "const_0_to_3_operand")])))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && (TARGET_SSE || TARGET_3DNOW_A)" "@ @@ -3473,7 +3473,7 @@ (zero_extend:SWI48 (vec_select:HI (match_operand:V4HI 1 "register_operand" "y,YW") - (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n")]))))] + (parallel [(match_operand:SI 2 "const_0_to_3_operand")]))))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && (TARGET_SSE || TARGET_3DNOW_A)" "@ @@ -3490,7 +3490,7 @@ [(set (match_operand:QI 0 "nonimmediate_operand" "=r,m") (vec_select:QI (match_operand:V8QI 1 "register_operand" "YW,YW") - (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n,n")])))] + (parallel [(match_operand:SI 2 "const_0_to_7_operand")])))] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ %vpextrb\t{%2, %1, %k0|%k0, %1, %2} @@ -3507,7 +3507,7 @@ (zero_extend:SWI248 (vec_select:QI (match_operand:V8QI 1 "register_operand" "YW") - (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n")]))))] + (parallel [(match_operand:SI 2 "const_0_to_7_operand")]))))] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "%vpextrb\t{%2, %1, %k0|%k0, %1, %2}" [(set_attr "type" "sselog1") @@ -3630,7 +3630,7 @@ (vec_merge:V4HI (match_operand:V4HI 2 "register_operand" "Yr,*x,x") (match_operand:V4HI 1 "register_operand" "0,0,x") - (match_operand:SI 3 "const_0_to_15_operand" "n,n,n")))] + (match_operand:SI 3 "const_0_to_15_operand")))] "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "@ pblendw\t{%3, %2, %0|%0, %2, %3} @@ -3648,7 +3648,7 @@ (vec_merge:V2HI (match_operand:V2HI 2 "register_operand" "Yr,*x,x") (match_operand:V2HI 1 "register_operand" "0,0,x") - (match_operand:SI 3 "const_0_to_7_operand" "n,n,n")))] + (match_operand:SI 3 "const_0_to_7_operand")))] "TARGET_SSE4_1" "@ pblendw\t{%3, %2, %0|%0, %2, %3} @@ -4035,7 +4035,7 @@ [(set (match_operand:HI 0 "register_sse4nonimm_operand" "=r,m") (vec_select:HI (match_operand:V2HI 1 "register_operand" "YW,YW") - (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n,n")])))] + (parallel [(match_operand:SI 2 "const_0_to_1_operand")])))] "TARGET_SSE2" "@ %vpextrw\t{%2, %1, %k0|%k0, %1, %2} @@ -4051,7 +4051,7 @@ (zero_extend:SWI48 (vec_select:HI (match_operand:V2HI 1 "register_operand" "YW") - (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")]))))] + (parallel [(match_operand:SI 2 "const_0_to_1_operand")]))))] "TARGET_SSE2" "%vpextrw\t{%2, %1, %k0|%k0, %1, %2}" [(set_attr "type" "sselog1") @@ -4063,7 +4063,7 @@ [(set (match_operand:QI 0 "nonimmediate_operand" "=r,m") (vec_select:QI (match_operand:V4QI 1 "register_operand" "YW,YW") - (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n")])))] + (parallel [(match_operand:SI 2 "const_0_to_3_operand")])))] "TARGET_SSE4_1" "@ %vpextrb\t{%2, %1, %k0|%k0, %1, %2} @@ -4080,7 +4080,7 @@ (zero_extend:SWI248 (vec_select:QI (match_operand:V4QI 1 "register_operand" 
"YW") - (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")]))))] + (parallel [(match_operand:SI 2 "const_0_to_3_operand")]))))] "TARGET_SSE4_1" "%vpextrb\t{%2, %1, %k0|%k0, %1, %2}" [(set_attr "type" "sselog1") @@ -4405,13 +4405,21 @@ (set_attr "type" "sseiadd") (set_attr "mode" "TI")]) -(define_insn "mmx_psadbw" +(define_expand "mmx_psadbw" + [(set (match_operand:V1DI 0 "register_operand") + (unspec:V1DI [(match_operand:V8QI 1 "register_mmxmem_operand") + (match_operand:V8QI 2 "register_mmxmem_operand")] + UNSPEC_PSADBW))] + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && (TARGET_SSE || TARGET_3DNOW_A)" + "ix86_fixup_binary_operands_no_copy (PLUS, V8QImode, operands);") + +(define_insn "*mmx_psadbw" [(set (match_operand:V1DI 0 "register_operand" "=y,x,Yw") - (unspec:V1DI [(match_operand:V8QI 1 "register_operand" "0,0,Yw") + (unspec:V1DI [(match_operand:V8QI 1 "register_mmxmem_operand" "%0,0,Yw") (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yw")] UNSPEC_PSADBW))] - "(TARGET_MMX || TARGET_MMX_WITH_SSE) - && (TARGET_SSE || TARGET_3DNOW_A)" + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && (TARGET_SSE || TARGET_3DNOW_A) + && ix86_binary_operator_ok (PLUS, V8QImode, operands)" "@ psadbw\t{%2, %0|%0, %2} psadbw\t{%2, %0|%0, %2} diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index a8cc17a..128144f 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -672,6 +672,12 @@ { if (SUBREG_P (op)) op = SUBREG_REG (op); + + /* Before reload, we can allow (SUBREG (MEM...)) as a register operand + because it is guaranteed to be reloaded into one. */ + if (MEM_P (op)) + return true; + return !(op == arg_pointer_rtx || op == frame_pointer_rtx || IN_RANGE (REGNO (op), @@ -685,6 +691,7 @@ { if (SUBREG_P (op)) op = SUBREG_REG (op); + if (reload_completed) return REG_OK_FOR_INDEX_STRICT_P (op); else @@ -906,6 +913,11 @@ (and (match_code "const_int") (match_test "IN_RANGE (INTVAL (op), 0, 63)"))) +;; Match 0 to 127. +(define_predicate "const_0_to_127_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 127)"))) + ;; Match 0 to 255. (define_predicate "const_0_to_255_operand" (and (match_code "const_int") diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h index b42b212..eb6a451 100644 --- a/gcc/config/i386/smmintrin.h +++ b/gcc/config/i386/smmintrin.h @@ -810,17 +810,11 @@ _mm_cmpgt_epi64 (__m128i __X, __m128i __Y) #include <popcntintrin.h> -#ifndef __SSE4_1__ +#ifndef __CRC32__ #pragma GCC push_options -#pragma GCC target("sse4.1") -#define __DISABLE_SSE4_1__ -#endif /* __SSE4_1__ */ - -#ifndef __SSE4_2__ -#pragma GCC push_options -#pragma GCC target("sse4.2") -#define __DISABLE_SSE4_2__ -#endif /* __SSE4_1__ */ +#pragma GCC target("crc32") +#define __DISABLE_CRC32__ +#endif /* __CRC32__ */ /* Accumulate CRC32 (polynomial 0x11EDC6F41) value. 
*/ extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -849,14 +843,9 @@ _mm_crc32_u64 (unsigned long long __C, unsigned long long __V) } #endif -#ifdef __DISABLE_SSE4_2__ -#undef __DISABLE_SSE4_2__ +#ifdef __DISABLE_CRC32__ +#undef __DISABLE_CRC32__ #pragma GCC pop_options -#endif /* __DISABLE_SSE4_2__ */ - -#ifdef __DISABLE_SSE4_1__ -#undef __DISABLE_SSE4_1__ -#pragma GCC pop_options -#endif /* __DISABLE_SSE4_1__ */ +#endif /* __DISABLE_CRC32__ */ #endif /* _SMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index e9292e6..3e3d96f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -301,7 +301,8 @@ ;; All 128bit and 256bit vector modes (define_mode_iterator V_128_256 - [V32QI V16QI V16HI V8HI V8SI V4SI V4DI V2DI V16HF V8HF V8SF V4SF V4DF V2DF]) + [V32QI V16QI V16HI V8HI V8SI V4SI V4DI V2DI V2TI V1TI + V16HF V8HF V8SF V4SF V4DF V2DF]) ;; All 512bit vector modes (define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF]) @@ -327,9 +328,7 @@ ;; 128-, 256- and 512-bit float vector modes for bitwise operations (define_mode_iterator VFB - [(V32HF "TARGET_AVX512FP16") - (V16HF "TARGET_AVX512FP16") - (V8HF "TARGET_AVX512FP16") + [(V32HF "TARGET_AVX512F") (V16HF "TARGET_AVX") (V8HF "TARGET_SSE2") (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) @@ -340,8 +339,7 @@ ;; 128- and 256-bit float vector modes for bitwise operations (define_mode_iterator VFB_128_256 - [(V16HF "TARGET_AVX512FP16") - (V8HF "TARGET_AVX512FP16") + [(V16HF "TARGET_AVX") (V8HF "TARGET_SSE2") (V8SF "TARGET_AVX") V4SF (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) @@ -399,7 +397,7 @@ ;; All 512bit vector float modes for bitwise operations (define_mode_iterator VFB_512 - [(V32HF "TARGET_AVX512FP16") V16SF V8DF]) + [V32HF V16SF V8DF]) (define_mode_iterator VI48_AVX512VL [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") @@ -469,9 +467,9 @@ ;; All DImode vector integer modes (define_mode_iterator V_AVX - [V16QI V8HI V4SI V2DI V4SF V2DF + [V16QI V8HI V4SI V2DI V1TI V4SF V2DF (V32QI "TARGET_AVX") (V16HI "TARGET_AVX") - (V8SI "TARGET_AVX") (V4DI "TARGET_AVX") + (V8SI "TARGET_AVX") (V4DI "TARGET_AVX") (V2TI "TARGET_AVX") (V8SF "TARGET_AVX") (V4DF"TARGET_AVX")]) (define_mode_iterator VI48_AVX @@ -893,15 +891,16 @@ [(V4SF "sse4_1") (V2DF "sse4_1") (V8SF "avx") (V4DF "avx") (V8DF "avx512f") + (V2TI "avx") (V1TI "sse4_1") (V4DI "avx") (V2DI "sse4_1") (V8SI "avx") (V4SI "sse4_1") (V16QI "sse4_1") (V32QI "avx") (V8HI "sse4_1") (V16HI "avx")]) (define_mode_attr avxsizesuffix - [(V64QI "512") (V32HI "512") (V16SI "512") (V8DI "512") - (V32QI "256") (V16HI "256") (V8SI "256") (V4DI "256") - (V16QI "") (V8HI "") (V4SI "") (V2DI "") + [(V64QI "512") (V32HI "512") (V16SI "512") (V8DI "512") (V4TI "512") + (V32QI "256") (V16HI "256") (V8SI "256") (V4DI "256") (V2TI "256") + (V16QI "") (V8HI "") (V4SI "") (V2DI "") (V1TI "") (V32HF "512") (V16SF "512") (V8DF "512") (V16HF "256") (V8SF "256") (V4DF "256") (V8HF "") (V4SF "") (V2DF "")]) @@ -988,6 +987,15 @@ (V32HI "V32HI") (V64QI "V64QI") (V32QI "V32QI") (V16QI "V16QI")]) +;; Mapping of vector modes to an V*HImode of the same size +(define_mode_attr ssewvecmode + [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI") + (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")]) + +(define_mode_attr ssewvecmodelower + [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi") + (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")]) + (define_mode_attr sseintvecmode2 [(V8DF 
"XI") (V4DF "OI") (V2DF "TI") (V8SF "OI") (V4SF "TI") @@ -1059,6 +1067,18 @@ (V4DF "V8SF") (V2DF "V4SF") (V32HF "V16SF") (V16HF "V8SF") (V8HF "V4SF")]) +(define_mode_attr ssePSmodelower + [(V16SI "v16sf") (V8DF "v16sf") + (V16SF "v16sf") (V8DI "v16sf") + (V64QI "v16sf") (V32QI "v8sf") (V16QI "v4sf") + (V32HI "v16sf") (V16HI "v8sf") (V8HI "v4sf") + (V8SI "v8sf") (V4SI "v4sf") + (V4DI "v8sf") (V2DI "v4sf") + (V4TI "v16sf") (V2TI "v8sf") (V1TI "v4sf") + (V8SF "v8sf") (V4SF "v4sf") + (V4DF "v8sf") (V2DF "v4sf") + (V32HF "v16sf") (V16HF "v8sf") (V8HF "v4sf")]) + (define_mode_attr ssePSmode2 [(V8DI "V8SF") (V4DI "V4SF")]) @@ -1185,6 +1205,11 @@ (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI") (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")]) +(define_mode_attr ssepackmodelower + [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si") + (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si") + (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")]) + ;; Mapping of the max integer size for xop rotate immediate constraint (define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")]) @@ -1461,7 +1486,7 @@ (match_operand:MODEFH 1 "memory_operand" "m")) (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C") (match_operand:QI 3 "register_operand" "Yk")) - (match_operand:<ssevecmode> 4 "const0_operand" "C") + (match_operand:<ssevecmode> 4 "const0_operand") (const_int 1)))] "TARGET_AVX512F" "vmov<ssescalarmodesuffix>\t{%1, %0%{%3%}%N2|%0%{3%}%N2, %1}" @@ -1586,13 +1611,22 @@ (set_attr "memory" "store") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "sse2_movq128" - [(set (match_operand:V2DI 0 "register_operand" "=v") +(define_expand "sse2_movq128" + [(set (match_operand:V2DI 0 "register_operand") (vec_concat:V2DI (vec_select:DI - (match_operand:V2DI 1 "nonimmediate_operand" "vm") + (match_operand:V2DI 1 "nonimmediate_operand") (parallel [(const_int 0)])) (const_int 0)))] + "TARGET_SSE2") + +(define_insn "*sse2_movq128_<mode>" + [(set (match_operand:VI8F_128 0 "register_operand" "=v") + (vec_concat:VI8F_128 + (vec_select:<ssescalarmode> + (match_operand:VI8F_128 1 "nonimmediate_operand" "vm") + (parallel [(const_int 0)])) + (match_operand:<ssescalarmode> 2 "const0_operand")))] "TARGET_SSE2" "%vmovq\t{%1, %0|%0, %q1}" [(set_attr "type" "ssemov") @@ -1986,7 +2020,7 @@ [(set (match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "=k") (any_lshift:SWI1248_AVX512BWDQ (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k") - (match_operand 2 "const_0_to_255_operand" "n"))) + (match_operand 2 "const_0_to_255_operand"))) (unspec [(const_int 0)] UNSPEC_MASKOP)] "TARGET_AVX512F" "k<mshift><mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}" @@ -3408,7 +3442,7 @@ (unspec:VF_128_256 [(match_operand:VF_128_256 1 "register_operand" "x") (match_operand:VF_128_256 2 "nonimmediate_operand" "xm") - (match_operand:SI 3 "const_0_to_31_operand" "n")] + (match_operand:SI 3 "const_0_to_31_operand")] UNSPEC_PCMP))] "TARGET_AVX" "vcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" @@ -3608,6 +3642,8 @@ operands[1] = force_reg (<ssebytemode>mode, gen_lowpart (<ssebytemode>mode, operands[1])); operands[2] = gen_lowpart (<ssebytemode>mode, operands[2]); + + operands[3] = force_reg (<ssebytemode>mode, operands[3]); operands[3] = lowpart_subreg (<MODE>mode, operands[3], <ssebytemode>mode); }) @@ -3617,7 +3653,7 @@ (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "x") (match_operand:VF_128 2 "nonimmediate_operand" "xm") - (match_operand:SI 3 "const_0_to_31_operand" "n")] + (match_operand:SI 3 "const_0_to_31_operand")] UNSPEC_PCMP) 
(match_dup 1) (const_int 1)))] @@ -3710,7 +3746,7 @@ (unspec:<V48H_AVX512VL:avx512fmaskmode> [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand") (match_operand:V48H_AVX512VL 2 "nonimmediate_operand") - (match_operand:SI 3 "const_0_to_7_operand" "n")] + (match_operand:SI 3 "const_0_to_7_operand")] UNSPEC_PCMP)))] "TARGET_AVX512F && (!VALID_MASK_AVX512BW_MODE (<SWI248x:MODE>mode) || TARGET_AVX512BW) @@ -3916,7 +3952,7 @@ (unspec:<avx512fmaskmode> [(match_operand:VI12_AVX512VL 1 "register_operand" "v") (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") - (match_operand:SI 3 "const_0_to_7_operand" "n")] + (match_operand:SI 3 "const_0_to_7_operand")] UNSPEC_UNSIGNED_PCMP))] "TARGET_AVX512BW" "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %3}" @@ -4014,7 +4050,7 @@ (unspec:<avx512fmaskmode> [(match_operand:VI48_AVX512VL 1 "register_operand" "v") (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") - (match_operand:SI 3 "const_0_to_7_operand" "n")] + (match_operand:SI 3 "const_0_to_7_operand")] UNSPEC_UNSIGNED_PCMP))] "TARGET_AVX512F" "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %3}" @@ -4154,7 +4190,7 @@ (unspec:<avx512fmaskmode> [(match_operand:VFH_128 1 "register_operand" "v") (match_operand:VFH_128 2 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>") - (match_operand:SI 3 "const_0_to_31_operand" "n")] + (match_operand:SI 3 "const_0_to_31_operand")] UNSPEC_PCMP) (const_int 1)))] "TARGET_AVX512F" @@ -4170,7 +4206,7 @@ (unspec:<avx512fmaskmode> [(match_operand:VFH_128 1 "register_operand" "v") (match_operand:VFH_128 2 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>") - (match_operand:SI 3 "const_0_to_31_operand" "n")] + (match_operand:SI 3 "const_0_to_31_operand")] UNSPEC_PCMP) (and:<avx512fmaskmode> (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk") @@ -4355,13 +4391,63 @@ (match_operator:V2DI 1 "" [(match_operand:V2DI 2 "register_operand") (match_operand:V2DI 3 "vector_operand")]))] - "TARGET_SSE4_1" + "TARGET_SSE2" { - bool ok = ix86_expand_int_vec_cmp (operands); + bool ok; + if (!TARGET_SSE4_1) + { + rtx ops[4]; + ops[0] = gen_reg_rtx (V4SImode); + ops[2] = gen_lowpart (V4SImode, force_reg (V2DImode, operands[2])); + ops[3] = gen_lowpart (V4SImode, force_reg (V2DImode, operands[3])); + ops[1] = gen_rtx_fmt_ee (GET_CODE (operands[1]), V4SImode, + ops[2], ops[3]); + ok = ix86_expand_int_vec_cmp (ops); + + rtx tmp1 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pshufd (tmp1, ops[0], GEN_INT (0xb1))); + + rtx tmp2 = gen_reg_rtx (V4SImode); + if (GET_CODE (operands[1]) == EQ) + emit_insn (gen_andv4si3 (tmp2, tmp1, ops[0])); + else + emit_insn (gen_iorv4si3 (tmp2, tmp1, ops[0])); + + emit_move_insn (operands[0], gen_lowpart (V2DImode, tmp2)); + } + else + ok = ix86_expand_int_vec_cmp (operands); gcc_assert (ok); DONE; }) +(define_expand "vec_cmpeqv1tiv1ti" + [(set (match_operand:V1TI 0 "register_operand") + (match_operator:V1TI 1 "" + [(match_operand:V1TI 2 "register_operand") + (match_operand:V1TI 3 "vector_operand")]))] + "TARGET_SSE2" +{ + rtx dst = gen_reg_rtx (V2DImode); + rtx op1 = gen_lowpart (V2DImode, force_reg (V1TImode, operands[2])); + rtx op2 = gen_lowpart (V2DImode, force_reg (V1TImode, operands[3])); + rtx cmp = gen_rtx_fmt_ee (GET_CODE (operands[1]), V2DImode, op1, op2); + emit_insn (gen_vec_cmpeqv2div2di (dst, cmp, op1, op2)); + + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_lowpart 
(V4SImode, dst); + emit_insn (gen_sse2_pshufd (tmp1, tmp2, GEN_INT (0x4e))); + + rtx tmp3 = gen_reg_rtx (V4SImode); + if (GET_CODE (operands[1]) == EQ) + emit_insn (gen_andv4si3 (tmp3, tmp2, tmp1)); + else + emit_insn (gen_iorv4si3 (tmp3, tmp2, tmp1)); + + emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3)); + DONE; +}) + (define_expand "vcond<V_512:mode><VF_512:mode>" [(set (match_operand:V_512 0 "register_operand") (if_then_else:V_512 @@ -4494,10 +4580,10 @@ }) (define_expand "vcond_mask_<mode><sseintvecmodelower>" - [(set (match_operand:VI124_128 0 "register_operand") - (vec_merge:VI124_128 - (match_operand:VI124_128 1 "vector_operand") - (match_operand:VI124_128 2 "nonimm_or_0_operand") + [(set (match_operand:VI_128 0 "register_operand") + (vec_merge:VI_128 + (match_operand:VI_128 1 "vector_operand") + (match_operand:VI_128 2 "nonimm_or_0_operand") (match_operand:<sseintvecmode> 3 "register_operand")))] "TARGET_SSE2" { @@ -4506,13 +4592,13 @@ DONE; }) -(define_expand "vcond_mask_v2div2di" - [(set (match_operand:V2DI 0 "register_operand") - (vec_merge:V2DI - (match_operand:V2DI 1 "vector_operand") - (match_operand:V2DI 2 "nonimm_or_0_operand") - (match_operand:V2DI 3 "register_operand")))] - "TARGET_SSE4_2" +(define_expand "vcond_mask_v1tiv1ti" + [(set (match_operand:V1TI 0 "register_operand") + (vec_merge:V1TI + (match_operand:V1TI 1 "vector_operand") + (match_operand:V1TI 2 "nonimm_or_0_operand") + (match_operand:V1TI 3 "register_operand")))] + "TARGET_SSE2" { ix86_expand_sse_movcc (operands[0], operands[3], operands[1], operands[2]); @@ -4557,7 +4643,8 @@ (not:VFB_128_256 (match_operand:VFB_128_256 1 "register_operand" "0,x,v,v")) (match_operand:VFB_128_256 2 "vector_operand" "xBm,xm,vm,vm")))] - "TARGET_SSE && <mask_avx512vl_condition>" + "TARGET_SSE && <mask_avx512vl_condition> + && (!<mask_applied> || <ssescalarmode>mode != HFmode)" { char buf[128]; const char *ops; @@ -4624,7 +4711,7 @@ (not:VFB_512 (match_operand:VFB_512 1 "register_operand" "v")) (match_operand:VFB_512 2 "nonimmediate_operand" "vm")))] - "TARGET_AVX512F" + "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)" { char buf[128]; const char *ops; @@ -4659,7 +4746,8 @@ (any_logic:VFB_128_256 (match_operand:VFB_128_256 1 "vector_operand") (match_operand:VFB_128_256 2 "vector_operand")))] - "TARGET_SSE && <mask_avx512vl_condition>" + "TARGET_SSE && <mask_avx512vl_condition> + && (!<mask_applied> || <ssescalarmode>mode != HFmode)" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") (define_expand "<code><mode>3<mask_name>" @@ -4667,7 +4755,7 @@ (any_logic:VFB_512 (match_operand:VFB_512 1 "nonimmediate_operand") (match_operand:VFB_512 2 "nonimmediate_operand")))] - "TARGET_AVX512F" + "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") (define_insn "*<code><mode>3<mask_name>" @@ -4676,6 +4764,7 @@ (match_operand:VFB_128_256 1 "vector_operand" "%0,x,v,v") (match_operand:VFB_128_256 2 "vector_operand" "xBm,xm,vm,vm")))] "TARGET_SSE && <mask_avx512vl_condition> + && (!<mask_applied> || <ssescalarmode>mode != HFmode) && !(MEM_P (operands[1]) && MEM_P (operands[2]))" { char buf[128]; @@ -4742,7 +4831,8 @@ (any_logic:VFB_512 (match_operand:VFB_512 1 "nonimmediate_operand" "%v") (match_operand:VFB_512 2 "nonimmediate_operand" "vm")))] - "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2])) + && (!<mask_applied> || 
<ssescalarmode>mode != HFmode)" { char buf[128]; const char *ops; @@ -4899,11 +4989,14 @@ ] (const_string "<ssevecmode>")))]) -(define_insn "*andnottf3" - [(set (match_operand:TF 0 "register_operand" "=x,x,v,v") - (and:TF - (not:TF (match_operand:TF 1 "register_operand" "0,x,v,v")) - (match_operand:TF 2 "vector_operand" "xBm,xm,vm,v")))] +;; Modes for andnot3 not covered by VI and MODEF. +(define_mode_iterator ANDNOT_MODE [TF V1TI]) + +(define_insn "*andnot<mode>3" + [(set (match_operand:ANDNOT_MODE 0 "register_operand" "=x,x,v,v") + (and:ANDNOT_MODE + (not:ANDNOT_MODE (match_operand:ANDNOT_MODE 1 "register_operand" "0,x,v,v")) + (match_operand:ANDNOT_MODE 2 "vector_operand" "xBm,xm,vm,v")))] "TARGET_SSE" { char buf[128]; @@ -5989,7 +6082,7 @@ (match_operand:VFH_128 1 "register_operand" "0,0") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) - (match_operand:VFH_128 4 "const0_operand" "C,C") + (match_operand:VFH_128 4 "const0_operand") (match_operand:QI 5 "register_operand" "Yk,Yk")) (match_dup 1) (const_int 1)))] @@ -6047,7 +6140,7 @@ (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) - (match_operand:VFH_128 4 "const0_operand" "C,C") + (match_operand:VFH_128 4 "const0_operand") (match_operand:QI 5 "register_operand" "Yk,Yk")) (match_dup 1) (const_int 1)))] @@ -6119,7 +6212,7 @@ (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) (match_operand:VFH_128 1 "register_operand" "0,0") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) - (match_operand:VFH_128 4 "const0_operand" "C,C") + (match_operand:VFH_128 4 "const0_operand") (match_operand:QI 5 "register_operand" "Yk,Yk")) (match_dup 1) (const_int 1)))] @@ -6180,7 +6273,7 @@ (match_operand:VFH_128 1 "register_operand" "0,0") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) - (match_operand:VFH_128 4 "const0_operand" "C,C") + (match_operand:VFH_128 4 "const0_operand") (match_operand:QI 5 "register_operand" "Yk,Yk")) (match_dup 1) (const_int 1)))] @@ -6310,7 +6403,7 @@ (match_operand:<avx512fmaskcmode> 4 "register_operand")] "TARGET_AVX512FP16 && <round_mode512bit_condition>" { - rtx op0, op1; + rtx op0, op1, dest; if (<round_embedded_complex>) emit_insn (gen_<avx512>_fmaddc_<mode>_mask<round_expand_name> ( operands[0], operands[1], operands[2], operands[3], @@ -6319,9 +6412,16 @@ emit_insn (gen_<avx512>_fmaddc_<mode>_mask (operands[0], operands[1], operands[2], operands[3], operands[4])); - op0 = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode); + op0 = lowpart_subreg (<ssePSmode>mode, + force_reg (<MODE>mode, operands[0]), + <MODE>mode); + dest = gen_reg_rtx (<ssePSmode>mode); + if (!MEM_P (operands[1])) + operands[1] = force_reg (<MODE>mode, operands[1]); op1 = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode); - emit_insn (gen_<complexmove>_mask (op0, op0, op1, operands[4])); + emit_insn (gen_<complexmove>_mask (dest, op0, op1, operands[4])); + emit_move_insn (operands[0], + lowpart_subreg (<MODE>mode, dest, <ssePSmode>mode)); DONE; }) @@ -6347,7 +6447,7 @@ (match_operand:<avx512fmaskcmode> 4 "register_operand")] "TARGET_AVX512FP16 && <round_mode512bit_condition>" { - rtx op0, op1; + rtx op0, op1, dest; if (<round_embedded_complex>) emit_insn 
(gen_<avx512>_fcmaddc_<mode>_mask<round_expand_name> ( operands[0], operands[1], operands[2], operands[3], @@ -6358,9 +6458,16 @@ operands[1], operands[2], operands[3], operands[4])); } - op0 = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode); + op0 = lowpart_subreg (<ssePSmode>mode, + force_reg (<MODE>mode, operands[0]), + <MODE>mode); + dest = gen_reg_rtx (<ssePSmode>mode); + if (!MEM_P (operands[1])) + operands[1] = force_reg (<MODE>mode, operands[1]); op1 = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode); - emit_insn (gen_<complexmove>_mask (op0, op0, op1, operands[4])); + emit_insn (gen_<complexmove>_mask (dest, op0, op1, operands[4])); + emit_move_insn (operands[0], + lowpart_subreg (<MODE>mode, dest, <ssePSmode>mode)); DONE; }) @@ -6470,17 +6577,23 @@ (match_operand:<ssePSmode> 2 "bcst_vector_operand") 0) (match_operand:VF_AVX512FP16VL 3 "vector_operand")] UNSPEC_COMPLEX_FMA))] - "TARGET_AVX512FP16" + "TARGET_AVX512FP16 && ix86_pre_reload_split ()" "#" "&& 1" - [(set (match_dup 0) - (unspec:<ssePSmode> - [(match_dup 1) (match_dup 2) (match_dup 3)] - UNSPEC_COMPLEX_FMA_PAIR))] + [(const_int 0)] { - operands[0] = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode); + if (!MEM_P (operands[1])) + operands[1] = force_reg (<MODE>mode, operands[1]); + if (!MEM_P (operands[3])) + operands[3] = force_reg (<MODE>mode, operands[3]); operands[1] = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode); operands[3] = lowpart_subreg (<ssePSmode>mode, operands[3], <MODE>mode); + rtx dest = gen_reg_rtx (<ssePSmode>mode); + emit_insn (gen_fma_fmaddc_<ssePSmodelower>_pair (dest, operands[1], + operands[2], operands[3])); + emit_move_insn (operands[0], + lowpart_subreg (<MODE>mode, dest, <ssePSmode>mode)); + DONE; }) (define_insn_and_split "fma_<mode>_fcmaddc_bcst" @@ -6491,17 +6604,24 @@ (match_operand:<ssePSmode> 2 "bcst_vector_operand") 0) (match_operand:VF_AVX512FP16VL 3 "vector_operand")] UNSPEC_COMPLEX_FCMA))] - "TARGET_AVX512FP16" + "TARGET_AVX512FP16 && ix86_pre_reload_split ()" "#" "&& 1" - [(set (match_dup 0) - (unspec:<ssePSmode> - [(match_dup 1) (match_dup 2) (match_dup 3)] - UNSPEC_COMPLEX_FCMA_PAIR))] + [(const_int 0)] { - operands[0] = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode); + if (!MEM_P (operands[1])) + operands[1] = force_reg (<MODE>mode, operands[1]); + if (!MEM_P (operands[3])) + operands[3] = force_reg (<MODE>mode, operands[3]); operands[1] = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode); operands[3] = lowpart_subreg (<ssePSmode>mode, operands[3], <MODE>mode); + rtx dest = gen_reg_rtx (<ssePSmode>mode); + emit_insn (gen_fma_fcmaddc_<ssePSmodelower>_pair (dest, operands[1], + operands[2], + operands[3])); + emit_move_insn (operands[0], + lowpart_subreg (<MODE>mode, dest, <ssePSmode>mode)); + DONE; }) (define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>" @@ -6567,7 +6687,7 @@ (match_operand:QI 4 "register_operand")] "TARGET_AVX512FP16 && <round_mode512bit_condition>" { - rtx op0, op1; + rtx op0, op1, dest; if (<round_embedded_complex>) emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> ( @@ -6577,26 +6697,15 @@ emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0], operands[1], operands[2], operands[3], operands[4])); - if (TARGET_AVX512VL) - { - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4])); - } - else - { - rtx mask, tmp, vec_mask; - mask = lowpart_subreg 
(SImode, operands[4], QImode), - tmp = gen_reg_rtx (SImode); - emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); - vec_mask = gen_reg_rtx (V4SImode); - emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode))); - emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp)); - vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode); - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); - emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask)); - } + op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), + V8HFmode); + if (!MEM_P (operands[1])) + operands[1] = force_reg (V8HFmode, operands[1]); + op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); + dest = gen_reg_rtx (V4SFmode); + emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4])); + emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, + V4SFmode)); DONE; }) @@ -6622,7 +6731,7 @@ (match_operand:QI 4 "register_operand")] "TARGET_AVX512FP16 && <round_mode512bit_condition>" { - rtx op0, op1; + rtx op0, op1, dest; if (<round_embedded_complex>) emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> ( @@ -6632,26 +6741,15 @@ emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0], operands[1], operands[2], operands[3], operands[4])); - if (TARGET_AVX512VL) - { - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); - emit_insn (gen_avx512vl_loadv4sf_mask (op0, op0, op1, operands[4])); - } - else - { - rtx mask, tmp, vec_mask; - mask = lowpart_subreg (SImode, operands[4], QImode), - tmp = gen_reg_rtx (SImode); - emit_insn (gen_ashlsi3 (tmp, mask, GEN_INT (31))); - vec_mask = gen_reg_rtx (V4SImode); - emit_insn (gen_rtx_SET (vec_mask, CONST0_RTX (V4SImode))); - emit_insn (gen_vec_setv4si_0 (vec_mask, vec_mask, tmp)); - vec_mask = lowpart_subreg (V4SFmode, vec_mask, V4SImode); - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); - op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); - emit_insn (gen_sse4_1_blendvps (op0, op1, op0, vec_mask)); - } + op0 = lowpart_subreg (V4SFmode, force_reg (V8HFmode, operands[0]), + V8HFmode); + if (!MEM_P (operands[1])) + operands[1] = force_reg (V8HFmode, operands[1]); + op1 = lowpart_subreg (V4SFmode, operands[1], V8HFmode); + dest = gen_reg_rtx (V4SFmode); + emit_insn (gen_avx512f_movsf_mask (dest, op1, op0, op1, operands[4])); + emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, + V4SFmode)); DONE; }) @@ -6663,7 +6761,7 @@ (match_operand:QI 4 "register_operand")] "TARGET_AVX512FP16 && <round_mode512bit_condition>" { - rtx op0, op1; + rtx dest, op0, op1; if (<round_embedded_complex>) emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask<round_expand_name> ( @@ -6673,9 +6771,15 @@ emit_insn (gen_avx512fp16_fcmaddcsh_v8hf_mask (operands[0], operands[1], operands[2], operands[3], operands[4])); - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); + dest = gen_reg_rtx (V4SFmode); + op0 = lowpart_subreg (V4SFmode, + force_reg (V8HFmode, operands[0]), + V8HFmode); + if (!MEM_P (operands[3])) + operands[3] = force_reg (V8HFmode, operands[3]); op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode); - emit_insn (gen_sse_movss (op0, op1, op0)); + emit_insn (gen_sse_movss (dest, op1, op0)); + emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode)); DONE; }) @@ -6687,7 +6791,7 @@ (match_operand:QI 4 "register_operand")] "TARGET_AVX512FP16 && <round_mode512bit_condition>" { - rtx op0, op1; + rtx dest, op0, op1; if 
(<round_embedded_complex>) emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask<round_expand_name> ( @@ -6697,9 +6801,15 @@ emit_insn (gen_avx512fp16_fmaddcsh_v8hf_mask (operands[0], operands[1], operands[2], operands[3], operands[4])); - op0 = lowpart_subreg (V4SFmode, operands[0], V8HFmode); + dest = gen_reg_rtx (V4SFmode); + op0 = lowpart_subreg (V4SFmode, + force_reg (V8HFmode, operands[0]), + V8HFmode); + if (!MEM_P (operands[3])) + operands[3] = force_reg (V8HFmode, operands[3]); op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode); - emit_insn (gen_sse_movss (op0, op1, op0)); + emit_insn (gen_sse_movss (dest, op1, op0)); + emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode)); DONE; }) @@ -6714,7 +6824,7 @@ (match_dup 2) (const_int 3)))] "TARGET_AVX512FP16" - "v<complexopname>sh\t{<round_scalarcz_mask_op4>%2, %1, %0<mask_scalarcz_operand4>|%0<mask_scalarcz_operand4>, %1, %2<round_scalarcz_maskcz_mask_op4>}" + "v<complexopname>sh\t{<round_scalarcz_mask_op4>%2, %1, %0<mask_scalarcz_operand4>|%0<mask_scalarcz_operand4>, %1, %2<round_scalarcz_mask_op4>}" [(set_attr "type" "ssemuladd") (set_attr "mode" "V8HF")]) @@ -6809,9 +6919,10 @@ (match_operand:VI4_128_8_256 1 "vector_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" { - operands[0] = lowpart_subreg (V8HFmode, operands[0], V4HFmode); - emit_insn (gen_avx512fp16_float<floatunssuffix><mode>v4hf2 (operands[0], + rtx dest = gen_reg_rtx (V8HFmode); + emit_insn (gen_avx512fp16_float<floatunssuffix><mode>v4hf2 (dest, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, dest, V8HFmode)); DONE; }) @@ -6827,7 +6938,7 @@ [(set (match_operand:V8HF 0 "register_operand" "=v") (vec_concat:V8HF (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) - (match_operand:V4HF 2 "const0_operand" "C")))] + (match_operand:V4HF 2 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -6856,7 +6967,7 @@ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)])) (match_operand:QI 3 "register_operand" "Yk")) - (match_operand:V4HF 4 "const0_operand" "C")))] + (match_operand:V4HF 4 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssecvt") @@ -6869,9 +6980,9 @@ (vec_merge:V4HF (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) - (match_operand:V4HF 3 "const0_operand" "C") + (match_operand:V4HF 3 "const0_operand") (match_operand:QI 2 "register_operand" "Yk")) - (match_operand:V4HF 4 "const0_operand" "C")))] + (match_operand:V4HF 4 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") @@ -6884,9 +6995,10 @@ (match_operand:V2DI 1 "vector_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" { - operands[0] = lowpart_subreg (V8HFmode, operands[0], V2HFmode); - emit_insn (gen_avx512fp16_float<floatunssuffix>v2div2hf2 (operands[0], + rtx dest = gen_reg_rtx (V8HFmode); + emit_insn (gen_avx512fp16_float<floatunssuffix>v2div2hf2 (dest, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, dest, V8HFmode)); DONE; }) @@ -6902,7 +7014,7 @@ [(set (match_operand:V8HF 0 "register_operand" "=v") (vec_concat:V8HF (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) - (match_operand:V6HF 2 
"const0_operand" "C")))] + (match_operand:V6HF 2 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ph{x}\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -6931,7 +7043,7 @@ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") (parallel [(const_int 0) (const_int 1)])) (match_operand:QI 3 "register_operand" "Yk")) - (match_operand:V6HF 4 "const0_operand" "C")))] + (match_operand:V6HF 4 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ph{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssecvt") @@ -6944,9 +7056,9 @@ (vec_merge:V2HF (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) - (match_operand:V2HF 3 "const0_operand" "C") + (match_operand:V2HF 3 "const0_operand") (match_operand:QI 2 "register_operand" "Yk")) - (match_operand:V6HF 4 "const0_operand" "C")))] + (match_operand:V6HF 4 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ph{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") @@ -7017,7 +7129,9 @@ { if (!MEM_P (operands[1])) { - operands[1] = lowpart_subreg (V8HFmode, operands[1], V4HFmode); + operands[1] = lowpart_subreg (V8HFmode, + force_reg (V4HFmode, operands[1]), + V4HFmode); emit_insn (gen_avx512fp16_fix<fixunssuffix>_trunc<mode>2 (operands[0], operands[1])); DONE; @@ -7054,7 +7168,9 @@ { if (!MEM_P (operands[1])) { - operands[1] = lowpart_subreg (V8HFmode, operands[1], V2HFmode); + operands[1] = lowpart_subreg (V8HFmode, + force_reg (V2HFmode, operands[1]), + V2HFmode); emit_insn (gen_avx512fp16_fix<fixunssuffix>_truncv2di2 (operands[0], operands[1])); DONE; @@ -7133,7 +7249,9 @@ { if (!MEM_P (operands[1])) { - operands[1] = lowpart_subreg (V8HFmode, operands[1], V4HFmode); + operands[1] = lowpart_subreg (V8HFmode, + force_reg (V4HFmode, operands[1]), + V4HFmode); emit_insn (gen_avx512fp16_float_extend_ph<mode>2 (operands[0], operands[1])); DONE; @@ -7170,7 +7288,9 @@ { if (!MEM_P (operands[1])) { - operands[1] = lowpart_subreg (V8HFmode, operands[1], V2HFmode); + operands[1] = lowpart_subreg (V8HFmode, + force_reg (V2HFmode, operands[1]), + V2HFmode); emit_insn (gen_avx512fp16_float_extend_phv2df2 (operands[0], operands[1])); DONE; @@ -7220,8 +7340,10 @@ (float_truncate:V4HF (match_operand:VF4_128_8_256 1 "vector_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" { - operands[0] = lowpart_subreg (V8HFmode, operands[0], V4HFmode); - emit_insn (gen_avx512fp16_trunc<mode>v4hf2 (operands[0], operands[1])); + rtx dest = gen_reg_rtx (V8HFmode); + + emit_insn (gen_avx512fp16_trunc<mode>v4hf2 (dest, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, dest, V8HFmode)); DONE; }) @@ -7239,7 +7361,7 @@ (vec_concat:V8HF (float_truncate:V4HF (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) - (match_operand:V4HF 2 "const0_operand" "C")))] + (match_operand:V4HF 2 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -7272,7 +7394,7 @@ (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)])) (match_operand:QI 3 "register_operand" "Yk")) - (match_operand:V4HF 4 "const0_operand" "C")))] + (match_operand:V4HF 4 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssecvt") @@ -7285,9 +7407,9 @@ (vec_merge:V4HF (float_truncate:V4HF (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) - (match_operand:V4HF 3 "const0_operand" 
"C") + (match_operand:V4HF 3 "const0_operand") (match_operand:QI 2 "register_operand" "Yk")) - (match_operand:V4HF 4 "const0_operand" "C")))] + (match_operand:V4HF 4 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") @@ -7299,8 +7421,9 @@ (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" { - operands[0] = lowpart_subreg (V8HFmode, operands[0], V2HFmode); - emit_insn (gen_avx512fp16_truncv2dfv2hf2 (operands[0], operands[1])); + rtx dest = gen_reg_rtx (V8HFmode); + emit_insn (gen_avx512fp16_truncv2dfv2hf2 (dest, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, dest, V8HFmode)); DONE; }) @@ -7317,7 +7440,7 @@ (vec_concat:V8HF (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand" "vm")) - (match_operand:V6HF 2 "const0_operand" "C")))] + (match_operand:V6HF 2 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvtpd2ph{x}\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -7347,7 +7470,7 @@ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") (parallel [(const_int 0) (const_int 1)])) (match_operand:QI 3 "register_operand" "Yk")) - (match_operand:V6HF 4 "const0_operand" "C")))] + (match_operand:V6HF 4 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvtpd2ph{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssecvt") @@ -7360,9 +7483,9 @@ (vec_merge:V2HF (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand" "vm")) - (match_operand:V2HF 3 "const0_operand" "C") + (match_operand:V2HF 3 "const0_operand") (match_operand:QI 2 "register_operand" "Yk")) - (match_operand:V6HF 4 "const0_operand" "C")))] + (match_operand:V6HF 4 "const0_operand")))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "vcvtpd2ph{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") @@ -8146,7 +8269,7 @@ [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_concat:V4SF (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) - (match_operand:V2SF 2 "const0_operand" "C")))] + (match_operand:V2SF 2 "const0_operand")))] "TARGET_AVX512DQ && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ps{x}\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -8221,7 +8344,7 @@ (match_operand:V4SF 2 "nonimm_or_0_operand" "0C") (parallel [(const_int 0) (const_int 1)])) (match_operand:QI 3 "register_operand" "Yk")) - (match_operand:V2SF 4 "const0_operand" "C")))] + (match_operand:V2SF 4 "const0_operand")))] "TARGET_AVX512DQ && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ps{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssecvt") @@ -8234,9 +8357,9 @@ (vec_merge:V2SF (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) - (match_operand:V2SF 3 "const0_operand" "C") + (match_operand:V2SF 3 "const0_operand") (match_operand:QI 2 "register_operand" "Yk")) - (match_operand:V2SF 4 "const0_operand" "C")))] + (match_operand:V2SF 4 "const0_operand")))] "TARGET_AVX512DQ && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ps{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") @@ -8860,7 +8983,7 @@ (vec_concat:V4SF (float_truncate:V2SF (match_operand:V2DF 1 "vector_operand" "vBm")) - (match_operand:V2SF 2 "const0_operand" "C")))] + (match_operand:V2SF 2 "const0_operand")))] "TARGET_SSE2" { if (TARGET_AVX) @@ -8905,7 +9028,7 @@ (match_operand:V4SF 2 "nonimm_or_0_operand" "0C") (parallel [(const_int 0) (const_int 1)])) (match_operand:QI 3 "register_operand" "Yk")) - (match_operand:V2SF 4 
"const0_operand" "C")))] + (match_operand:V2SF 4 "const0_operand")))] "TARGET_AVX512VL" "vcvtpd2ps{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssecvt") @@ -8918,9 +9041,9 @@ (vec_merge:V2SF (float_truncate:V2SF (match_operand:V2DF 1 "nonimmediate_operand" "vm")) - (match_operand:V2SF 3 "const0_operand" "C") + (match_operand:V2SF 3 "const0_operand") (match_operand:QI 2 "register_operand" "Yk")) - (match_operand:V2SF 4 "const0_operand" "C")))] + (match_operand:V2SF 4 "const0_operand")))] "TARGET_AVX512VL" "vcvtpd2ps{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") @@ -10455,7 +10578,7 @@ [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_concat:V4SF (match_operand:V2SF 1 "nonimmediate_operand" "vm") - (match_operand:V2SF 2 "const0_operand" " C")))] + (match_operand:V2SF 2 "const0_operand")))] "TARGET_SSE2" "%vmovq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") @@ -10466,11 +10589,11 @@ ;; see comment above inline_secondary_memory_needed function in i386.cc (define_insn "vec_set<mode>_0" [(set (match_operand:VI4F_128 0 "nonimmediate_operand" - "=Yr,*x,v,v,v,x,x,v,Yr ,*x ,x ,m ,m ,m") + "=Yr,*x,v,v,v,x,x,v,Yr ,?x ,x ,m ,m ,m") (vec_merge:VI4F_128 (vec_duplicate:VI4F_128 (match_operand:<ssescalarmode> 2 "general_operand" - " Yr,*x,v,m,r ,m,x,v,*rm,*rm,*rm,!x,!*re,!*fF")) + " Yr,*x,v,m,r ,m,x,v,?rm,?rm,?rm,!x,?re,!*fF")) (match_operand:VI4F_128 1 "nonimm_or_0_operand" " C , C,C,C,C ,C,0,v,0 ,0 ,x ,0 ,0 ,0") (const_int 1)))] @@ -10563,9 +10686,9 @@ [(set (attr "isa") (cond [(eq_attr "alternative" "0,1,2") (const_string "avx512fp16") - (eq_attr "alternative" "3") + (eq_attr "alternative" "3,4") (const_string "noavx") - (eq_attr "alternative" "4,5,6") + (eq_attr "alternative" "5,6") (const_string "sse4_noavx") (eq_attr "alternative" "7,8,9") (const_string "avx") @@ -10613,7 +10736,7 @@ (vec_merge:VI2F_256_512 (vec_duplicate:VI2F_256_512 (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,m")) - (match_operand:VI2F_256_512 1 "const0_operand" "C,C") + (match_operand:VI2F_256_512 1 "const0_operand") (const_int 1)))] "TARGET_AVX512FP16" "@ @@ -10623,6 +10746,46 @@ (set_attr "prefix" "evex") (set_attr "mode" "HF")]) +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi" + [(set (match_operand:VI48_AVX512F 0 "register_operand") + (vec_merge:VI48_AVX512F + (vec_duplicate:VI48_AVX512F + (zero_extend:<ssescalarmode> + (match_operand:HI 1 "nonimmediate_operand"))) + (match_operand:VI48_AVX512F 2 "const0_operand") + (const_int 1)))] + "TARGET_AVX512FP16 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx dest = gen_reg_rtx (<ssewvecmode>mode); + emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest, + CONST0_RTX (<ssewvecmode>mode), + operands[1])); + emit_move_insn (operands[0], + lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode)); + DONE; +}) + +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1" + [(set (match_operand:V2DI 0 "register_operand") + (vec_concat:V2DI + (zero_extend:DI + (match_operand:HI 1 "nonimmediate_operand")) + (const_int 0)))] + "TARGET_AVX512FP16 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx dest = gen_reg_rtx (V8HImode); + emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1])); + emit_move_insn (operands[0], + lowpart_subreg (V2DImode, dest, V8HImode)); + DONE; +}) + (define_insn "avx512fp16_movsh" [(set (match_operand:V8HF 0 "register_operand" "=v") (vec_merge:V8HF @@ -10673,7 +10836,7 @@ (vec_merge:VI4F_256_512 (vec_duplicate:VI4F_256_512 (match_operand:<ssescalarmode> 2 
"nonimmediate_operand" "v,m,r")) - (match_operand:VI4F_256_512 1 "const0_operand" "C,C,C") + (match_operand:VI4F_256_512 1 "const0_operand") (const_int 1)))] "TARGET_AVX" "@ @@ -10692,11 +10855,51 @@ ] (symbol_ref "true")))]) +(define_insn_and_split "*vec_set<mode>_0_zero_extendsi" + [(set (match_operand:VI8 0 "register_operand") + (vec_merge:VI8 + (vec_duplicate:VI8 + (zero_extend:DI + (match_operand:SI 1 "nonimmediate_operand"))) + (match_operand:VI8 2 "const0_operand") + (const_int 1)))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx dest = gen_reg_rtx (<ssepackmode>mode); + emit_insn (gen_vec_set<ssepackmodelower>_0 (dest, + CONST0_RTX (<ssepackmode>mode), + operands[1])); + emit_move_insn (operands[0], + lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode)); + DONE; +}) + +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1" + [(set (match_operand:V2DI 0 "register_operand") + (vec_concat:V2DI + (zero_extend:DI + (match_operand:SI 1 "nonimmediate_operand")) + (const_int 0)))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + rtx dest = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1])); + emit_move_insn (operands[0], + lowpart_subreg (V2DImode, dest, V4SImode)); + DONE; +}) + (define_insn "sse4_1_insertps" [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v") (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm") (match_operand:V4SF 1 "register_operand" "0,0,v") - (match_operand:SI 3 "const_0_to_255_operand" "n,n,n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_INSERTPS))] "TARGET_SSE4_1" { @@ -10813,7 +11016,7 @@ [(set (match_operand:SF 0 "nonimmediate_operand" "=rm,rm,rm,Yv,Yv") (vec_select:SF (match_operand:V4SF 1 "register_operand" "Yr,*x,v,0,v") - (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n,n,n,n")])))] + (parallel [(match_operand:SI 2 "const_0_to_3_operand")])))] "TARGET_SSE4_1" "@ extractps\t{%2, %1, %0|%0, %1, %2} @@ -10852,10 +11055,10 @@ (set_attr "mode" "V4SF,V4SF,V4SF,*,*")]) (define_insn_and_split "*vec_extractv4sf_mem" - [(set (match_operand:SF 0 "register_operand" "=v,*r,f") + [(set (match_operand:SF 0 "register_operand" "=v,?r,f") (vec_select:SF (match_operand:V4SF 1 "memory_operand" "o,o,o") - (parallel [(match_operand 2 "const_0_to_3_operand" "n,n,n")])))] + (parallel [(match_operand 2 "const_0_to_3_operand")])))] "TARGET_SSE" "#" "&& reload_completed" @@ -11729,7 +11932,7 @@ "operands[1] = gen_lowpart (HFmode, operands[1]);") (define_insn "*vec_extracthf" - [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=*r,m,x,v") + [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=?r,m,x,v") (vec_select:HF (match_operand:V8HF 1 "register_operand" "v,v,0,v") (parallel @@ -13338,7 +13541,7 @@ (vec_merge:VF2_512_256 (vec_duplicate:VF2_512_256 (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "vm")) - (match_operand:VF2_512_256 1 "const0_operand" "C") + (match_operand:VF2_512_256 1 "const0_operand") (const_int 1)))] "TARGET_AVX" "vmovq\t{%2, %x0|%x0, %2}" @@ -13397,7 +13600,9 @@ "&& 1" [(set (match_dup 0) (truncate:V16HI (match_dup 1)))] - "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);") + "operands[1] = lowpart_subreg (V16SImode, + force_reg (V32HImode, operands[1]), + V32HImode);") (define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1_hf" [(set (match_operand:V16HF 0 "nonimmediate_operand") @@ -13421,8 +13626,18 @@ [(set (match_dup 0) (truncate:V16HI (match_dup 
1)))] { - operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode); - operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode); + operands[1] = lowpart_subreg (V16SImode, + force_reg (V32HImode, operands[1]), + V32HImode); + if (MEM_P (operands[0])) + operands[0] = lowpart_subreg (V16HImode, operands[0], V16HFmode); + else + { + rtx op0 = gen_reg_rtx (V16HImode); + emit_insn (gen_truncv16siv16hi2 (op0, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (V16HFmode, op0, V16HImode)); + DONE; + } }) @@ -13442,7 +13657,9 @@ "&& 1" [(set (match_dup 0) (truncate:V8HI (match_dup 1)))] - "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);") + "operands[1] = lowpart_subreg (V8SImode, + force_reg (V16HImode, operands[1]), + V16HImode);") (define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1_hf" [(set (match_operand:V8HF 0 "nonimmediate_operand") @@ -13462,8 +13679,18 @@ [(set (match_dup 0) (truncate:V8HI (match_dup 1)))] { - operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode); - operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode); + operands[1] = lowpart_subreg (V8SImode, + force_reg (V16HImode, operands[1]), + V16HImode); + if (MEM_P (operands[0])) + operands[0] = lowpart_subreg (V8HImode, operands[0], V8HFmode); + else + { + rtx op0 = gen_reg_rtx (V8HImode); + emit_insn (gen_truncv8siv8hi2 (op0, operands[1])); + emit_move_insn (operands[0], lowpart_subreg (V8HFmode, op0, V8HImode)); + DONE; + } }) (define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1" @@ -13482,7 +13709,9 @@ "&& 1" [(set (match_dup 0) (truncate:V8SI (match_dup 1)))] - "operands[1] = lowpart_subreg (V8DImode, operands[1], V16SImode);") + "operands[1] = lowpart_subreg (V8DImode, + force_reg (V16SImode, operands[1]), + V16SImode);") (define_insn "avx512f_<code><pmov_src_lower><mode>2_mask" [(set (match_operand:PMOV_DST_MODE_1 0 "nonimmediate_operand" "=v,m") @@ -13552,7 +13781,9 @@ "&& 1" [(set (match_dup 0) (truncate:V32QI (match_dup 1)))] - "operands[1] = lowpart_subreg (V32HImode, operands[1], V64QImode);") + "operands[1] = lowpart_subreg (V32HImode, + force_reg (V64QImode, operands[1]), + V64QImode);") (define_insn "avx512bw_<code>v32hiv32qi2_mask" [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m") @@ -13620,7 +13851,9 @@ "&& 1" [(set (match_dup 0) (truncate:V16QI (match_dup 1)))] - "operands[1] = lowpart_subreg (V16HImode, operands[1], V32QImode);") + "operands[1] = lowpart_subreg (V16HImode, + force_reg (V32QImode, operands[1]), + V32QImode);") (define_insn_and_split "*avx512f_permvar_truncv4div4si_1" [(set (match_operand:V4SI 0 "nonimmediate_operand") @@ -13636,7 +13869,9 @@ "&& 1" [(set (match_dup 0) (truncate:V4SI (match_dup 1)))] - "operands[1] = lowpart_subreg (V4DImode, operands[1], V8SImode);") + "operands[1] = lowpart_subreg (V4DImode, + force_reg (V8SImode, operands[1]), + V8SImode);") (define_insn "<avx512>_<code><ssedoublemodelower><mode>2_mask" [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m") @@ -13714,7 +13949,9 @@ [(const_int 0)] { rtx op1 = gen_reg_rtx (V8QImode); - operands[1] = lowpart_subreg (V8HImode, operands[1], V16QImode); + operands[1] = lowpart_subreg (V8HImode, + force_reg (V16QImode, operands[1]), + V16QImode); emit_insn (gen_truncv8hiv8qi2 (op1, operands[1])); emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V8QImode)); DONE; @@ -14092,7 +14329,9 @@ [(const_int 0)] { rtx op1 = gen_reg_rtx (V4HImode); - operands[1] = lowpart_subreg (V4SImode, operands[1], V16QImode); + operands[1] = 
lowpart_subreg (V4SImode, + force_reg (V16QImode, operands[1]), + V16QImode); emit_insn (gen_truncv4siv4hi2 (op1, operands[1])); emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V4HImode)); DONE; @@ -14339,7 +14578,9 @@ [(const_int 0)] { rtx op1 = gen_reg_rtx (V2SImode); - operands[1] = lowpart_subreg (V2DImode, operands[1], V4SImode); + operands[1] = lowpart_subreg (V2DImode, + force_reg (V4SImode, operands[1]), + V4SImode); emit_insn (gen_truncv2div2si2 (op1, operands[1])); emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V2SImode)); DONE; @@ -15373,8 +15614,12 @@ || (<MODE_SIZE> < 64 && ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI))) { - operands[1] = lowpart_subreg (<sseunpackmode>mode, operands[1], <MODE>mode); - operands[2] = lowpart_subreg (<sseunpackmode>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<sseunpackmode>mode, + force_reg (<MODE>mode, operands[1]), + <MODE>mode); + operands[2] = lowpart_subreg (<sseunpackmode>mode, + force_reg (<MODE>mode, operands[2]), + <MODE>mode); emit_insn (gen_rtx_SET (operands[0], operands[3])); emit_insn (gen_vpdpwssd_<SDOT_VPDP_SUF> (operands[0], operands[3], operands[1], operands[2])); @@ -15598,8 +15843,9 @@ sel[7] = 15; } vec_perm_indices indices (sel, 2, 8); - bool ok = targetm.vectorize.vec_perm_const (V8SImode, target, - arg0, arg1, indices); + bool ok = targetm.vectorize.vec_perm_const (V8SImode, V8SImode, + target, arg0, arg1, + indices); gcc_assert (ok); emit_move_insn (operands[0], lowpart_subreg (V4DImode, target, V8SImode)); @@ -15783,7 +16029,7 @@ [(set (match_operand:VIMAX_AVX512VL 0 "register_operand" "=v") (any_lshift:VIMAX_AVX512VL (match_operand:VIMAX_AVX512VL 1 "nonimmediate_operand" "vm") - (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + (match_operand:SI 2 "const_0_to_255_mul_8_operand")))] "TARGET_AVX512BW" { operands[2] = GEN_INT (INTVAL (operands[2]) / 8); @@ -15798,7 +16044,7 @@ [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,Yw") (any_lshift:VIMAX_AVX2 (match_operand:VIMAX_AVX2 1 "register_operand" "0,Yw") - (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))] + (match_operand:SI 2 "const_0_to_255_mul_8_operand")))] "TARGET_SSE2" { operands[2] = GEN_INT (INTVAL (operands[2]) / 8); @@ -16642,17 +16888,6 @@ (match_operand:<avx512fmaskmode> 4 "register_operand")))] "TARGET_AVX512F") -(define_expand "<sse2_avx2>_andnot<mode>3_mask" - [(set (match_operand:VI12_AVX512VL 0 "register_operand") - (vec_merge:VI12_AVX512VL - (and:VI12_AVX512VL - (not:VI12_AVX512VL - (match_operand:VI12_AVX512VL 1 "register_operand")) - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand")) - (match_operand:VI12_AVX512VL 3 "nonimm_or_0_operand") - (match_operand:<avx512fmaskmode> 4 "register_operand")))] - "TARGET_AVX512BW") - (define_insn "*andnot<mode>3" [(set (match_operand:VI 0 "register_operand" "=x,x,v") (and:VI @@ -17868,7 +18103,7 @@ (match_operand:AVX512_VEC 1 "reg_or_0_operand" "v,C,C") (vec_duplicate:AVX512_VEC (match_operand:<ssequartermode> 2 "nonimmediate_operand" "vm,xm,vm")) - (match_operand:SI 3 "const_int_operand" "n,n,n")))] + (match_operand:SI 3 "const_int_operand")))] "TARGET_AVX512F && (INTVAL (operands[3]) == (GET_MODE_UNIT_SIZE (<MODE>mode) == 4 ? 
0xFFF0 : 0xFC))" @@ -17916,7 +18151,7 @@ (match_operand:AVX512_VEC 1 "register_operand" "v") (vec_duplicate:AVX512_VEC (match_operand:<ssequartermode> 2 "nonimmediate_operand" "vm")) - (match_operand:SI 3 "const_int_operand" "n")))] + (match_operand:SI 3 "const_int_operand")))] "TARGET_AVX512F" { int mask; @@ -18584,7 +18819,7 @@ [(set (match_operand:V32HI 0 "register_operand" "=v") (unspec:V32HI [(match_operand:V32HI 1 "nonimmediate_operand" "vm") - (match_operand:SI 2 "const_0_to_255_operand" "n")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_PSHUFLW))] "TARGET_AVX512BW" "vpshuflw\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" @@ -18760,7 +18995,7 @@ [(set (match_operand:V32HI 0 "register_operand" "=v") (unspec:V32HI [(match_operand:V32HI 1 "nonimmediate_operand" "vm") - (match_operand:SI 2 "const_0_to_255_operand" "n")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_PSHUFHW))] "TARGET_AVX512BW" "vpshufhw\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" @@ -18932,6 +19167,26 @@ (define_mode_iterator PEXTR_MODE12 [(V16QI "TARGET_SSE4_1") V8HI]) +(define_insn_and_split "*vec_extract<mode>_0_mem" + [(set (match_operand:<ssescalarmode> 0 "memory_operand") + (vec_select:<ssescalarmode> + (match_operand:PEXTR_MODE12 1 "register_operand") + (parallel [(const_int 0)])))] + "TARGET_SSE2 + && !TARGET_SSE4_1 + && (TARGET_INTER_UNIT_MOVES_FROM_VEC + || optimize_function_for_speed_p (cfun)) + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 2) (match_dup 3)) + (set (match_dup 0) (match_dup 4))] +{ + operands[2] = gen_reg_rtx (SImode); + operands[3] = gen_lowpart (SImode, force_reg (<MODE>mode, operands[1])); + operands[4] = gen_lowpart (<ssescalarmode>mode, operands[2]); +}) + (define_insn "*vec_extract<mode>" [(set (match_operand:<ssescalarmode> 0 "register_sse4nonimm_operand" "=r,m") (vec_select:<ssescalarmode> @@ -19397,11 +19652,11 @@ (set_attr "type" "sselog,ssemov,sselog,ssemov,mmxcvt,mmxmov") (set_attr "mode" "TI,TI,V4SF,SF,DI,DI")]) -(define_insn "*vec_concatv4si" - [(set (match_operand:V4SI 0 "register_operand" "=x,v,x,x,v") - (vec_concat:V4SI - (match_operand:V2SI 1 "register_operand" " 0,v,0,0,v") - (match_operand:V2SI 2 "nonimmediate_operand" " x,v,x,m,m")))] +(define_insn "*vec_concat<mode>" + [(set (match_operand:VI124_128 0 "register_operand" "=x,v,x,x,v") + (vec_concat:VI124_128 + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,0,v") + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,x,m,m")))] "TARGET_SSE" "@ punpcklqdq\t{%2, %0|%0, %2} @@ -19414,11 +19669,65 @@ (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) +(define_insn_and_split "*vec_concatv16qi_permt2" + [(set (match_operand:V16QI 0 "register_operand") + (unspec:V16QI + [(const_vector:V16QI [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 16) (const_int 17) + (const_int 18) (const_int 19) + (const_int 20) (const_int 21) + (const_int 22) (const_int 23)]) + (match_operand:V16QI 1 "register_operand") + (match_operand:V16QI 2 "nonimmediate_operand")] + UNSPEC_VPERMT2))] + "TARGET_AVX512VL && TARGET_AVX512VBMI + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_concat:V16QI (match_dup 1) (match_dup 2)))] +{ + operands[1] = lowpart_subreg (V8QImode, + force_reg (V16QImode, operands[1]), + V16QImode); + if (!MEM_P (operands[2])) + operands[2] = force_reg (V16QImode, operands[2]); + operands[2] = 
lowpart_subreg (V8QImode, operands[2], V16QImode); +}) + +(define_insn_and_split "*vec_concatv8hi_permt2" + [(set (match_operand:V8HI 0 "register_operand") + (unspec:V8HI + [(const_vector:V8HI [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11)]) + (match_operand:V8HI 1 "register_operand") + (match_operand:V8HI 2 "nonimmediate_operand")] + UNSPEC_VPERMT2))] + "TARGET_AVX512VL && TARGET_AVX512BW + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (vec_concat:V8HI (match_dup 1) (match_dup 2)))] +{ + operands[1] = lowpart_subreg (V4HImode, + force_reg (V8HImode, operands[1]), + V8HImode); + if (!MEM_P (operands[2])) + operands[2] = force_reg (V8HImode, operands[2]); + operands[2] = lowpart_subreg (V4HImode, operands[2], V8HImode); +}) + (define_insn "*vec_concat<mode>_0" [(set (match_operand:VI124_128 0 "register_operand" "=v,x") (vec_concat:VI124_128 (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm,?!*y") - (match_operand:<ssehalfvecmode> 2 "const0_operand" " C,C")))] + (match_operand:<ssehalfvecmode> 2 "const0_operand")))] "TARGET_SSE2" "@ %vmovq\t{%1, %0|%0, %1} @@ -19492,7 +19801,7 @@ [(set (match_operand:V2DI 0 "register_operand" "=v,v ,x") (vec_concat:V2DI (match_operand:DI 1 "nonimmediate_operand" " r,vm,?!*y") - (match_operand:DI 2 "const0_operand" " C,C ,C")))] + (match_operand:DI 2 "const0_operand")))] "TARGET_SSE2" "@ * return HAVE_AS_IX86_INTERUNIT_MOVQ ? \"%vmovq\t{%1, %0|%0, %1}\" : \"%vmovd\t{%1, %0|%0, %1}\"; @@ -19516,7 +19825,7 @@ (vec_merge:VI8_AVX_AVX512F (vec_duplicate:VI8_AVX_AVX512F (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,vm")) - (match_operand:VI8_AVX_AVX512F 1 "const0_operand" "C,C") + (match_operand:VI8_AVX_AVX512F 1 "const0_operand") (const_int 1)))] "TARGET_AVX" "vmovq\t{%2, %x0|%x0, %2}" @@ -19672,13 +19981,23 @@ ;; The correct representation for this is absolutely enormous, and ;; surely not generally useful. 
-(define_insn "<sse2_avx2>_psadbw" +(define_expand "<sse2_avx2>_psadbw" + [(set (match_operand:VI8_AVX2_AVX512BW 0 "register_operand") + (unspec:VI8_AVX2_AVX512BW + [(match_operand:<ssebytemode> 1 "vector_operand") + (match_operand:<ssebytemode> 2 "vector_operand")] + UNSPEC_PSADBW))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (PLUS, <ssebytemode>mode, operands);") + +(define_insn "*<sse2_avx2>_psadbw" [(set (match_operand:VI8_AVX2_AVX512BW 0 "register_operand" "=x,YW") (unspec:VI8_AVX2_AVX512BW - [(match_operand:<ssebytemode> 1 "register_operand" "0,YW") + [(match_operand:<ssebytemode> 1 "vector_operand" "%0,YW") (match_operand:<ssebytemode> 2 "vector_operand" "xBm,YWm")] UNSPEC_PSADBW))] - "TARGET_SSE2" + "TARGET_SSE2 + && ix86_binary_operator_ok (PLUS, <ssebytemode>mode, operands)" "@ psadbw\t{%2, %0|%0, %2} vpsadbw\t{%2, %1, %0|%0, %1, %2}" @@ -19717,7 +20036,7 @@ (unspec:SI [(lt:VF_128_256 (match_operand:<sseintvecmode> 1 "register_operand" "x") - (match_operand:<sseintvecmode> 2 "const0_operand" "C"))] + (match_operand:<sseintvecmode> 2 "const0_operand"))] UNSPEC_MOVMSK))] "TARGET_SSE" "#" @@ -19735,7 +20054,7 @@ (unspec:SI [(lt:VF_128_256 (match_operand:<sseintvecmode> 1 "register_operand" "x") - (match_operand:<sseintvecmode> 2 "const0_operand" "C"))] + (match_operand:<sseintvecmode> 2 "const0_operand"))] UNSPEC_MOVMSK)))] "TARGET_64BIT && TARGET_SSE" "#" @@ -19753,7 +20072,7 @@ [(subreg:VF_128_256 (ashiftrt:<sseintvecmode> (match_operand:<sseintvecmode> 1 "register_operand" "x") - (match_operand:QI 2 "const_int_operand" "n")) 0)] + (match_operand:QI 2 "const_int_operand")) 0)] UNSPEC_MOVMSK))] "TARGET_SSE" "#" @@ -19772,7 +20091,7 @@ [(subreg:VF_128_256 (ashiftrt:<sseintvecmode> (match_operand:<sseintvecmode> 1 "register_operand" "x") - (match_operand:QI 2 "const_int_operand" "n")) 0)] + (match_operand:QI 2 "const_int_operand")) 0)] UNSPEC_MOVMSK)))] "TARGET_64BIT && TARGET_SSE" "#" @@ -19913,7 +20232,7 @@ [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x") - (match_operand:VI1_AVX2 2 "const0_operand" "C"))] + (match_operand:VI1_AVX2 2 "const0_operand"))] UNSPEC_MOVMSK))] "TARGET_SSE2" "#" @@ -19935,7 +20254,7 @@ (zero_extend:DI (unspec:SI [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x") - (match_operand:VI1_AVX2 2 "const0_operand" "C"))] + (match_operand:VI1_AVX2 2 "const0_operand"))] UNSPEC_MOVMSK)))] "TARGET_64BIT && TARGET_SSE2" "#" @@ -19957,7 +20276,7 @@ (sign_extend:DI (unspec:SI [(lt:V16QI (match_operand:V16QI 1 "register_operand" "x") - (match_operand:V16QI 2 "const0_operand" "C"))] + (match_operand:V16QI 2 "const0_operand"))] UNSPEC_MOVMSK)))] "TARGET_64BIT && TARGET_SSE2" "#" @@ -19974,6 +20293,24 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) +;; Optimize pxor/pcmpeqb/pmovmskb/cmp 0xffff to ptest. 
+(define_mode_attr vi1avx2const + [(V32QI "0xffffffff") (V16QI "0xffff")]) + +(define_split + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (unspec:SI + [(eq:VI1_AVX2 + (match_operand:VI1_AVX2 0 "vector_operand") + (match_operand:VI1_AVX2 1 "const0_operand"))] + UNSPEC_MOVMSK) + (match_operand 2 "const_int_operand")))] + "TARGET_SSE4_1 && (INTVAL (operands[2]) == (int) (<vi1avx2const>))" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0) + (match_dup 0)] + UNSPEC_PTEST))]) + (define_expand "sse2_maskmovdqu" [(set (match_operand:V16QI 0 "memory_operand") (unspec:V16QI [(match_operand:V16QI 1 "register_operand") @@ -20125,12 +20462,12 @@ (set_attr "mode" "TI")]) (define_insn_and_split "ssse3_ph<plusminus_mnemonic>wv4hi3" - [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv") + [(set (match_operand:V4HI 0 "register_operand" "=y,x,x") (ssse3_plusminus:V4HI (vec_select:V4HI (vec_concat:V8HI - (match_operand:V4HI 1 "register_operand" "0,0,Yv") - (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yv")) + (match_operand:V4HI 1 "register_operand" "0,0,x") + (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,x")) (parallel [(const_int 0) (const_int 2) (const_int 4) (const_int 6)])) (vec_select:V4HI @@ -20212,12 +20549,12 @@ (set_attr "mode" "TI")]) (define_insn_and_split "ssse3_ph<plusminus_mnemonic>dv2si3" - [(set (match_operand:V2SI 0 "register_operand" "=y,x,Yv") + [(set (match_operand:V2SI 0 "register_operand" "=y,x,x") (plusminus:V2SI (vec_select:V2SI (vec_concat:V4SI - (match_operand:V2SI 1 "register_operand" "0,0,Yv") - (match_operand:V2SI 2 "register_mmxmem_operand" "ym,x,Yv")) + (match_operand:V2SI 1 "register_operand" "0,0,x") + (match_operand:V2SI 2 "register_mmxmem_operand" "ym,x,x")) (parallel [(const_int 0) (const_int 2)])) (vec_select:V2SI (vec_concat:V4SI (match_dup 1) (match_dup 2)) @@ -20662,13 +20999,13 @@ }) (define_insn_and_split "*ssse3_pshufbv8qi3" - [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv") - (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yv") - (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv") + [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw") + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yw") + (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yw") (match_operand:V4SI 4 "reg_or_const_vector_operand" "i,3,3")] UNSPEC_PSHUFB)) - (clobber (match_scratch:V4SI 3 "=X,&x,&Yv"))] + (clobber (match_scratch:V4SI 3 "=X,&x,&Yw"))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" "@ pshufb\t{%2, %0|%0, %2} @@ -20715,10 +21052,10 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "ssse3_psign<mode>3" - [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,Yv") + [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x") (unspec:MMXMODEI - [(match_operand:MMXMODEI 1 "register_operand" "0,0,Yv") - (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,x,Yv")] + [(match_operand:MMXMODEI 1 "register_operand" "0,0,x") + (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,x,x")] UNSPEC_PSIGN))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" "@ @@ -20738,7 +21075,7 @@ (unspec:VI1_AVX512 [(match_operand:VI1_AVX512 1 "register_operand" "v") (match_operand:VI1_AVX512 2 "nonimmediate_operand" "vm") - (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")] + (match_operand:SI 3 "const_0_to_255_mul_8_operand")] UNSPEC_PALIGNR) (match_operand:VI1_AVX512 4 "nonimm_or_0_operand" "0C") (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] @@ -20759,7 +21096,7 @@ (unspec:SSESCALARMODE 
[(match_operand:SSESCALARMODE 1 "register_operand" "0,<v_Yw>") (match_operand:SSESCALARMODE 2 "vector_operand" "xBm,<v_Yw>m") - (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n")] + (match_operand:SI 3 "const_0_to_255_mul_8_operand")] UNSPEC_PALIGNR))] "TARGET_SSSE3" { @@ -20785,10 +21122,10 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn_and_split "ssse3_palignrdi" - [(set (match_operand:DI 0 "register_operand" "=y,x,Yv") - (unspec:DI [(match_operand:DI 1 "register_operand" "0,0,Yv") - (match_operand:DI 2 "register_mmxmem_operand" "ym,x,Yv") - (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n,n")] + [(set (match_operand:DI 0 "register_operand" "=y,x,Yw") + (unspec:DI [(match_operand:DI 1 "register_operand" "0,0,Yw") + (match_operand:DI 2 "register_mmxmem_operand" "ym,x,Yw") + (match_operand:SI 3 "const_0_to_255_mul_8_operand")] UNSPEC_PALIGNR))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" { @@ -21075,7 +21412,7 @@ (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") (lt:VF_128_256 (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") - (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))] + (match_operand:<sseintvecmode> 4 "const0_operand"))] UNSPEC_BLENDV))] "TARGET_SSE4_1" "#" @@ -21107,7 +21444,7 @@ (subreg:<ssebytemode> (lt:VI48_AVX (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x") - (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)] + (match_operand:VI48_AVX 4 "const0_operand")) 0)] UNSPEC_BLENDV))] "TARGET_SSE4_1" "#" @@ -21162,7 +21499,7 @@ (unspec:VF_128_256 [(match_operand:VF_128_256 1 "vector_operand" "%0,0,x") (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") - (match_operand:SI 3 "const_0_to_255_operand" "n,n,n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_DP))] "TARGET_SSE4_1" "@ @@ -21200,7 +21537,7 @@ (unspec:VI1_AVX2 [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") - (match_operand:SI 3 "const_0_to_255_operand" "n,n,n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_MPSADBW))] "TARGET_SSE4_1" "@ @@ -21289,7 +21626,7 @@ [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") (lt:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x") - (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))] + (match_operand:VI1_AVX2 4 "const0_operand"))] UNSPEC_BLENDV))] "TARGET_SSE4_1" "#" @@ -21333,7 +21670,7 @@ (vec_merge:V8_128 (match_operand:V8_128 2 "vector_operand" "YrBm,*xBm,xm") (match_operand:V8_128 1 "register_operand" "0,0,x") - (match_operand:SI 3 "const_0_to_255_operand" "n,n,n")))] + (match_operand:SI 3 "const_0_to_255_operand")))] "TARGET_SSE4_1" "@ pblendw\t{%3, %2, %0|%0, %2, %3} @@ -21389,11 +21726,12 @@ emit_insn (gen_avx2_pblend<ssemodesuffix> (tmp, operands[1], operands[2], blendw_idx)); - operands[0] = lowpart_subreg (V8SImode, operands[0], <MODE>mode); + rtx op0 = gen_reg_rtx (V8SImode); tmp = lowpart_subreg (V8SImode, tmp, <MODE>mode); operands[1] = lowpart_subreg (V8SImode, operands[1], <MODE>mode); - emit_insn (gen_avx2_pblenddv8si (operands[0], operands[1], + emit_insn (gen_avx2_pblenddv8si (op0, operands[1], tmp, blendd_idx)); + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8SImode)); } DONE; @@ -21404,7 +21742,7 @@ (vec_merge:V16_256 (match_operand:V16_256 2 "nonimmediate_operand" "xm") (match_operand:V16_256 1 "register_operand" "x") - (match_operand:SI 3 "avx2_pblendw_operand" "n")))] + (match_operand:SI 3 
"avx2_pblendw_operand")))] "TARGET_AVX2" { operands[3] = GEN_INT (INTVAL (operands[3]) & 0xff); @@ -21421,7 +21759,7 @@ (vec_merge:VI4_AVX2 (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm") (match_operand:VI4_AVX2 1 "register_operand" "x") - (match_operand:SI 3 "const_0_to_255_operand" "n")))] + (match_operand:SI 3 "const_0_to_255_operand")))] "TARGET_AVX2" "vpblendd\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "ssemov") @@ -21458,9 +21796,9 @@ (vec_select:V32QI (vec_concat:V64QI (match_operand:V32QI 1 "nonimmediate_operand" "vm") - (match_operand:V32QI 2 "const0_operand" "C")) + (match_operand:V32QI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_AVX2" "#" "&& reload_completed" @@ -21477,10 +21815,10 @@ (subreg:V32QI (vec_concat:VI248_256 (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") - (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0) - (match_operand:V32QI 3 "const0_operand" "C")) + (match_operand:<ssehalfvecmode> 2 "const0_operand")) 0) + (match_operand:V32QI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_AVX2" "#" "&& reload_completed" @@ -21512,9 +21850,9 @@ (vec_select:V64QI (vec_concat:V128QI (match_operand:V64QI 1 "nonimmediate_operand" "vm") - (match_operand:V64QI 2 "const0_operand" "C")) + (match_operand:V64QI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_AVX512BW" "#" "&& reload_completed" @@ -21531,10 +21869,10 @@ (subreg:V64QI (vec_concat:VI248_512 (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") - (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0) - (match_operand:V64QI 3 "const0_operand" "C")) + (match_operand:<ssehalfvecmode> 2 "const0_operand")) 0) + (match_operand:V64QI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_AVX512BW" "#" "&& reload_completed" @@ -21604,9 +21942,9 @@ (vec_select:V16QI (vec_concat:V32QI (match_operand:V16QI 1 "vector_operand" "YrBm,*xBm,Ywm") - (match_operand:V16QI 2 "const0_operand" "C,C,C")) + (match_operand:V16QI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n,n,n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_SSE4_1" "#" "&& reload_completed" @@ -21637,10 +21975,10 @@ (subreg:V16QI (vec_concat:VI248_128 (match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,Ywm") - (match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0) - (match_operand:V16QI 3 "const0_operand" "C,C,C")) + (match_operand:<ssehalfvecmode> 2 "const0_operand")) 0) + (match_operand:V16QI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n,n,n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_SSE4_1" "#" "&& reload_completed" @@ -21843,9 +22181,9 @@ (vec_select:V32HI (vec_concat:V64HI (match_operand:V32HI 1 "nonimmediate_operand" "vm") - (match_operand:V32HI 2 "const0_operand" "C")) + (match_operand:V32HI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_AVX512F" "#" "&& reload_completed" @@ -21862,10 +22200,10 @@ (subreg:V32HI 
(vec_concat:VI148_512 (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") - (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0) - (match_operand:V32HI 3 "const0_operand" "C")) + (match_operand:<ssehalfvecmode> 2 "const0_operand")) 0) + (match_operand:V32HI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_AVX512F" "#" "&& reload_completed" @@ -21897,9 +22235,9 @@ (vec_select:V16HI (vec_concat:V32HI (match_operand:V16HI 1 "nonimmediate_operand" "vm") - (match_operand:V16HI 2 "const0_operand" "C")) + (match_operand:V16HI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_AVX2" "#" "&& reload_completed" @@ -21916,10 +22254,10 @@ (subreg:V16HI (vec_concat:VI148_256 (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm") - (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")) 0) - (match_operand:V16HI 3 "const0_operand" "C")) + (match_operand:<ssehalfvecmode> 2 "const0_operand")) 0) + (match_operand:V16HI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_AVX2" "#" "&& reload_completed" @@ -21995,9 +22333,9 @@ (vec_select:V8HI (vec_concat:V16HI (match_operand:V8HI 1 "vector_operand" "YrBm,*xBm,vm") - (match_operand:V8HI 2 "const0_operand" "C,C,C")) + (match_operand:V8HI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n,n,n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_SSE4_1" "#" "&& reload_completed" @@ -22026,10 +22364,10 @@ (subreg:V8HI (vec_concat:VI148_128 (match_operand:<ssehalfvecmode> 1 "vector_operand" "YrBm,*xBm,vm") - (match_operand:<ssehalfvecmode> 2 "const0_operand" "C,C,C")) 0) - (match_operand:V8HI 3 "const0_operand" "C,C,C")) + (match_operand:<ssehalfvecmode> 2 "const0_operand")) 0) + (match_operand:V8HI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n,n,n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_SSE4_1" "#" "&& reload_completed" @@ -22187,15 +22525,52 @@ (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "TI")]) +(define_insn "*sse4_1_<code>v2qiv2di2<mask_name>_1" + [(set (match_operand:V2DI 0 "register_operand" "=v") + (any_extend:V2DI + (match_operand:V2QI 1 "memory_operand" "m")))] + "TARGET_SSE4_1 && <mask_avx512vl_condition>" + "%vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "TI")]) + (define_expand "<insn>v2qiv2di2" [(set (match_operand:V2DI 0 "register_operand") (any_extend:V2DI - (match_operand:V2QI 1 "register_operand")))] + (match_operand:V2QI 1 "nonimmediate_operand")))] "TARGET_SSE4_1" { - rtx op1 = force_reg (V2QImode, operands[1]); - op1 = lowpart_subreg (V16QImode, op1, V2QImode); - emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], op1)); + if (!MEM_P (operands[1])) + { + rtx op1 = force_reg (V2QImode, operands[1]); + op1 = lowpart_subreg (V16QImode, op1, V2QImode); + emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], op1)); + DONE; + } +}) + +(define_insn_and_split "*sse4_1_zero_extendv2qiv2di2_2" + [(set (match_operand:V2DI 0 "register_operand") + (zero_extend:V2DI + (vec_select:V2QI + (subreg:V16QI + (vec_merge:V8_128 + 
(vec_duplicate:V8_128 + (match_operand:<ssescalarmode> 1 "nonimmediate_operand")) + (match_operand:V8_128 2 "const0_operand") + (const_int 1)) 0) + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE4_1 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)] +{ + if (!MEM_P (operands[1])) + operands[1] = force_reg (<ssescalarmode>mode, operands[1]); + operands[1] = lowpart_subreg (V2QImode, operands[1], <ssescalarmode>mode); + emit_insn (gen_zero_extendv2qiv2di2 (operands[0], operands[1])); DONE; }) @@ -22350,9 +22725,9 @@ (vec_select:V16SI (vec_concat:V32SI (match_operand:V16SI 1 "nonimmediate_operand" "vm") - (match_operand:V16SI 2 "const0_operand" "C")) + (match_operand:V16SI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_AVX512F" "#" "&& reload_completed" @@ -22368,10 +22743,10 @@ (vec_concat:V32SI (vec_concat:V16SI (match_operand:V8SI 1 "nonimmediate_operand" "vm") - (match_operand:V8SI 2 "const0_operand" "C")) - (match_operand:V16SI 3 "const0_operand" "C")) + (match_operand:V8SI 2 "const0_operand")) + (match_operand:V16SI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_AVX512F" "#" "&& reload_completed" @@ -22402,9 +22777,9 @@ (vec_select:V8SI (vec_concat:V16SI (match_operand:V8SI 1 "nonimmediate_operand" "vm") - (match_operand:V8SI 2 "const0_operand" "C")) + (match_operand:V8SI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_AVX2" "#" "&& reload_completed" @@ -22420,10 +22795,10 @@ (vec_concat:V16SI (vec_concat:V8SI (match_operand:V4SI 1 "nonimmediate_operand" "vm") - (match_operand:V4SI 2 "const0_operand" "C")) - (match_operand:V8SI 3 "const0_operand" "C")) + (match_operand:V4SI 2 "const0_operand")) + (match_operand:V8SI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_AVX2" "#" "&& reload_completed" @@ -22486,9 +22861,9 @@ (vec_select:V4SI (vec_concat:V8SI (match_operand:V4SI 1 "vector_operand" "YrBm,*xBm,vm") - (match_operand:V4SI 2 "const0_operand" "C,C,C")) + (match_operand:V4SI 2 "const0_operand")) (match_parallel 3 "pmovzx_parallel" - [(match_operand 4 "const_int_operand" "n,n,n")])))] + [(match_operand 4 "const_int_operand")])))] "TARGET_SSE4_1" "#" "&& reload_completed" @@ -22514,10 +22889,10 @@ (vec_concat:V8SI (vec_concat:V4SI (match_operand:V2SI 1 "vector_operand" "YrBm, *xBm, vm") - (match_operand:V2SI 2 "const0_operand" "C,C,C")) - (match_operand:V4SI 3 "const0_operand" "C,C,C")) + (match_operand:V2SI 2 "const0_operand")) + (match_operand:V4SI 3 "const0_operand")) (match_parallel 4 "pmovzx_parallel" - [(match_operand 5 "const_int_operand" "n,n,n")])))] + [(match_operand 5 "const_int_operand")])))] "TARGET_SSE4_1" "#" "&& reload_completed" @@ -22621,7 +22996,7 @@ [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 [(match_operand:VF_128_256 1 "vector_operand" "YrBm,*xBm,xm") - (match_operand:SI 2 "const_0_to_15_operand" "n,n,n")] + (match_operand:SI 2 "const_0_to_15_operand")] UNSPEC_ROUND))] "TARGET_SSE4_1" "%vround<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" @@ -22714,7 +23089,7 @@ (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 2 "nonimmediate_operand" "Yrm,*xm,xm,vm") - 
(match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")] + (match_operand:SI 3 "const_0_to_15_operand")] UNSPEC_ROUND) (match_operand:VF_128 1 "register_operand" "0,0,x,v") (const_int 1)))] @@ -22738,7 +23113,7 @@ (vec_duplicate:VFH_128 (unspec:<ssescalarmode> [(match_operand:<ssescalarmode> 2 "nonimmediate_operand" "Yrm,*xm,xm,vm") - (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")] + (match_operand:SI 3 "const_0_to_15_operand")] UNSPEC_ROUND)) (match_operand:VFH_128 1 "register_operand" "0,0,x,v") (const_int 1)))] @@ -22851,7 +23226,7 @@ (match_operand:SI 3 "register_operand" "a,a") (match_operand:V16QI 4 "nonimmediate_operand" "x,m") (match_operand:SI 5 "register_operand" "d,d") - (match_operand:SI 6 "const_0_to_255_operand" "n,n")] + (match_operand:SI 6 "const_0_to_255_operand")] UNSPEC_PCMPESTR)) (set (match_operand:V16QI 1 "register_operand" "=Yz,Yz") (unspec:V16QI @@ -22911,7 +23286,7 @@ (match_operand:SI 2 "register_operand" "a,a") (match_operand:V16QI 3 "nonimmediate_operand" "x,m") (match_operand:SI 4 "register_operand" "d,d") - (match_operand:SI 5 "const_0_to_255_operand" "n,n")] + (match_operand:SI 5 "const_0_to_255_operand")] UNSPEC_PCMPESTR)) (set (reg:CC FLAGS_REG) (unspec:CC @@ -22939,7 +23314,7 @@ (match_operand:SI 2 "register_operand" "a,a") (match_operand:V16QI 3 "nonimmediate_operand" "x,m") (match_operand:SI 4 "register_operand" "d,d") - (match_operand:SI 5 "const_0_to_255_operand" "n,n")] + (match_operand:SI 5 "const_0_to_255_operand")] UNSPEC_PCMPESTR)) (set (reg:CC FLAGS_REG) (unspec:CC @@ -22967,7 +23342,7 @@ (match_operand:SI 3 "register_operand" "a,a,a,a") (match_operand:V16QI 4 "nonimmediate_operand" "x,m,x,m") (match_operand:SI 5 "register_operand" "d,d,d,d") - (match_operand:SI 6 "const_0_to_255_operand" "n,n,n,n")] + (match_operand:SI 6 "const_0_to_255_operand")] UNSPEC_PCMPESTR)) (clobber (match_scratch:V16QI 0 "=Yz,Yz,X,X")) (clobber (match_scratch:SI 1 "= X, X,c,c"))] @@ -22991,7 +23366,7 @@ (unspec:SI [(match_operand:V16QI 2 "register_operand" "x,x") (match_operand:V16QI 3 "nonimmediate_operand" "x,m") - (match_operand:SI 4 "const_0_to_255_operand" "n,n")] + (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_PCMPISTR)) (set (match_operand:V16QI 1 "register_operand" "=Yz,Yz") (unspec:V16QI @@ -23042,7 +23417,7 @@ (unspec:SI [(match_operand:V16QI 1 "register_operand" "x,x") (match_operand:V16QI 2 "nonimmediate_operand" "x,m") - (match_operand:SI 3 "const_0_to_255_operand" "n,n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_PCMPISTR)) (set (reg:CC FLAGS_REG) (unspec:CC @@ -23066,7 +23441,7 @@ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x,x") (match_operand:V16QI 2 "nonimmediate_operand" "x,m") - (match_operand:SI 3 "const_0_to_255_operand" "n,n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_PCMPISTR)) (set (reg:CC FLAGS_REG) (unspec:CC @@ -23090,7 +23465,7 @@ (unspec:CC [(match_operand:V16QI 2 "register_operand" "x,x,x,x") (match_operand:V16QI 3 "nonimmediate_operand" "x,m,x,m") - (match_operand:SI 4 "const_0_to_255_operand" "n,n,n,n")] + (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_PCMPISTR)) (clobber (match_scratch:V16QI 0 "=Yz,Yz,X,X")) (clobber (match_scratch:SI 1 "= X, X,c,c"))] @@ -23137,9 +23512,9 @@ [(unspec:P [(match_operand:P 2 "vsib_address_operand" "Tv") (match_operand:VI48_512 1 "register_operand" "v") - (match_operand:SI 3 "const1248_operand" "n")] + (match_operand:SI 3 "const1248_operand")] UNSPEC_VSIBADDR)]) - (match_operand:SI 4 "const_2_to_3_operand" "n")] + (match_operand:SI 4 
"const_2_to_3_operand")] UNSPEC_GATHER_PREFETCH)] "TARGET_AVX512PF" { @@ -23184,9 +23559,9 @@ [(unspec:P [(match_operand:P 2 "vsib_address_operand" "Tv") (match_operand:VI4_256_8_512 1 "register_operand" "v") - (match_operand:SI 3 "const1248_operand" "n")] + (match_operand:SI 3 "const1248_operand")] UNSPEC_VSIBADDR)]) - (match_operand:SI 4 "const_2_to_3_operand" "n")] + (match_operand:SI 4 "const_2_to_3_operand")] UNSPEC_GATHER_PREFETCH)] "TARGET_AVX512PF" { @@ -23231,9 +23606,9 @@ [(unspec:P [(match_operand:P 2 "vsib_address_operand" "Tv") (match_operand:VI48_512 1 "register_operand" "v") - (match_operand:SI 3 "const1248_operand" "n")] + (match_operand:SI 3 "const1248_operand")] UNSPEC_VSIBADDR)]) - (match_operand:SI 4 "const2367_operand" "n")] + (match_operand:SI 4 "const2367_operand")] UNSPEC_SCATTER_PREFETCH)] "TARGET_AVX512PF" { @@ -23280,9 +23655,9 @@ [(unspec:P [(match_operand:P 2 "vsib_address_operand" "Tv") (match_operand:VI4_256_8_512 1 "register_operand" "v") - (match_operand:SI 3 "const1248_operand" "n")] + (match_operand:SI 3 "const1248_operand")] UNSPEC_VSIBADDR)]) - (match_operand:SI 4 "const2367_operand" "n")] + (match_operand:SI 4 "const2367_operand")] UNSPEC_SCATTER_PREFETCH)] "TARGET_AVX512PF" { @@ -23492,6 +23867,30 @@ "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "sse4arg")]) +;; Recognize XOP's vpcmov from canonical (xor (and (xor t f) c) f) +(define_split + [(set (match_operand:V_128_256 0 "register_operand") + (xor:V_128_256 + (and:V_128_256 + (xor:V_128_256 (match_operand:V_128_256 1 "register_operand") + (match_operand:V_128_256 2 "register_operand")) + (match_operand:V_128_256 3 "nonimmediate_operand")) + (match_operand:V_128_256 4 "register_operand")))] + "TARGET_XOP + && (REGNO (operands[4]) == REGNO (operands[1]) + || REGNO (operands[4]) == REGNO (operands[2]))" + [(set (match_dup 0) (if_then_else:V_128_256 (match_dup 3) + (match_dup 5) + (match_dup 4)))] +{ + /* To handle the commutivity of XOR, operands[4] is either operands[1] + or operands[2], we need operands[5] to be the other one. */ + if (REGNO (operands[4]) == REGNO (operands[1])) + operands[5] = operands[2]; + else + operands[5] = operands[1]; +}) + ;; XOP horizontal add/subtract instructions (define_insn "xop_phadd<u>bw" [(set (match_operand:V8HI 0 "register_operand" "=x") @@ -23820,7 +24219,7 @@ [(set (match_operand:VI_128 0 "register_operand" "=x") (rotate:VI_128 (match_operand:VI_128 1 "nonimmediate_operand" "xm") - (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))] + (match_operand:SI 2 "const_0_to_<sserotatemax>_operand")))] "TARGET_XOP" "vprot<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") @@ -23831,7 +24230,7 @@ [(set (match_operand:VI_128 0 "register_operand" "=x") (rotatert:VI_128 (match_operand:VI_128 1 "nonimmediate_operand" "xm") - (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))] + (match_operand:SI 2 "const_0_to_<sserotatemax>_operand")))] "TARGET_XOP" { operands[3] @@ -24218,7 +24617,9 @@ vec_perm_builder sel (4, 4, 1); sel.quick_grow (4); rtx arg0, arg1; - rtx op1 = lowpart_subreg (V4SImode, operands[1], V2DImode); + rtx op1 = lowpart_subreg (V4SImode, + force_reg (V2DImode, operands[1]), + V2DImode); rtx target = gen_reg_rtx (V4SImode); if (UINTVAL (operands[2]) >= 63) { @@ -24264,8 +24665,9 @@ sel[3] = 7; } vec_perm_indices indices (sel, arg0 != arg1 ? 
2 : 1, 4); - bool ok = targetm.vectorize.vec_perm_const (V4SImode, target, - arg0, arg1, indices); + bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode, + target, arg0, arg1, + indices); gcc_assert (ok); emit_move_insn (operands[0], lowpart_subreg (V2DImode, target, V4SImode)); @@ -24284,9 +24686,11 @@ else { rtx temp = gen_reg_rtx (V4SImode); - emit_insn (gen_ashrv4si3 (temp, lowpart_subreg (V4SImode, - operands[1], - V2DImode), + emit_insn (gen_ashrv4si3 (temp, + lowpart_subreg (V4SImode, + force_reg (V2DImode, + operands[1]), + V2DImode), GEN_INT (31))); zero_or_all_ones = gen_reg_rtx (V4SImode); emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp, @@ -24433,7 +24837,7 @@ (unspec:VI_128 [(match_operand:VI_128 1 "register_operand" "x") (match_operand:VI_128 2 "nonimmediate_operand" "xm") - (match_operand:SI 3 "const_int_operand" "n")] + (match_operand:SI 3 "const_int_operand")] UNSPEC_XOP_TRUEFALSE))] "TARGET_XOP" { @@ -24453,7 +24857,7 @@ [(match_operand:VF_128_256 1 "register_operand" "x,x") (match_operand:VF_128_256 2 "nonimmediate_operand" "x,m") (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "xm,x") - (match_operand:SI 4 "const_0_to_3_operand" "n,n")] + (match_operand:SI 4 "const_0_to_3_operand")] UNSPEC_VPERMIL2))] "TARGET_XOP" "vpermil2<ssemodesuffix>\t{%4, %3, %2, %1, %0|%0, %1, %2, %3, %4}" @@ -24541,7 +24945,7 @@ (define_insn "aeskeygenassist" [(set (match_operand:V2DI 0 "register_operand" "=x") (unspec:V2DI [(match_operand:V2DI 1 "vector_operand" "xBm") - (match_operand:SI 2 "const_0_to_255_operand" "n")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_AESKEYGENASSIST))] "TARGET_AES" "%vaeskeygenassist\t{%2, %1, %0|%0, %1, %2}" @@ -24555,7 +24959,7 @@ [(set (match_operand:V2DI 0 "register_operand" "=x,x") (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x") (match_operand:V2DI 2 "vector_operand" "xBm,xm") - (match_operand:SI 3 "const_0_to_255_operand" "n,n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_PCLMUL))] "TARGET_PCLMUL" "@ @@ -25007,7 +25411,7 @@ (unspec:V4DI [(match_operand:V4DI 1 "register_operand" "x") (match_operand:V4DI 2 "nonimmediate_operand" "xm") - (match_operand:SI 3 "const_0_to_255_operand" "n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPERMTI))] "TARGET_AVX2" "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" @@ -25551,7 +25955,7 @@ (unspec:AVX256MODE2P [(match_operand:AVX256MODE2P 1 "register_operand" "x") (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "xm") - (match_operand:SI 3 "const_0_to_255_operand" "n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPERMIL2F128))] "TARGET_AVX" "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}" @@ -25591,7 +25995,7 @@ (vec_select:V_128 (match_operand:V_128 1 "register_operand" "0,Yw") (match_parallel 2 "palignr_operand" - [(match_operand 3 "const_int_operand" "n,n")])))] + [(match_operand 3 "const_int_operand")])))] "TARGET_SSSE3" { operands[2] = (GEN_INT (INTVAL (operands[3]) @@ -26261,7 +26665,7 @@ [(set (match_operand:V8HI 0 "register_operand" "=v") (vec_concat:V8HI (unspec:V4HI [(match_operand:V4SF 1 "register_operand" "v") - (match_operand:SI 2 "const_0_to_255_operand" "N")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_VCVTPS2PH) (match_operand:V4HI 3 "const0_operand")))] "(TARGET_F16C || TARGET_AVX512VL) && <mask_avx512vl_condition>" @@ -26273,7 +26677,7 @@ (define_insn "*vcvtps2ph_store<merge_mask_name>" [(set (match_operand:V4HI 0 "memory_operand" "=m") (unspec:V4HI [(match_operand:V4SF 1 "register_operand" "v") - 
(match_operand:SI 2 "const_0_to_255_operand" "N")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_VCVTPS2PH))] "TARGET_F16C || TARGET_AVX512VL" "vcvtps2ph\t{%2, %1, %0<merge_mask_operand3>|%0<merge_mask_operand3>, %1, %2}" @@ -26284,7 +26688,7 @@ (define_insn "vcvtps2ph256<mask_name>" [(set (match_operand:V8HI 0 "register_operand" "=v") (unspec:V8HI [(match_operand:V8SF 1 "register_operand" "v") - (match_operand:SI 2 "const_0_to_255_operand" "N")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_VCVTPS2PH))] "TARGET_F16C || TARGET_AVX512VL" "vcvtps2ph\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" @@ -26296,7 +26700,7 @@ (define_insn "*vcvtps2ph256<merge_mask_name>" [(set (match_operand:V8HI 0 "memory_operand" "=m") (unspec:V8HI [(match_operand:V8SF 1 "register_operand" "v") - (match_operand:SI 2 "const_0_to_255_operand" "N")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_VCVTPS2PH))] "TARGET_F16C || TARGET_AVX512VL" "vcvtps2ph\t{%2, %1, %0<merge_mask_operand3>|%0<merge_mask_operand3>, %1, %2}" @@ -26309,7 +26713,7 @@ [(set (match_operand:V16HI 0 "register_operand" "=v") (unspec:V16HI [(match_operand:V16SF 1 "register_operand" "v") - (match_operand:SI 2 "const_0_to_255_operand" "N")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_VCVTPS2PH))] "TARGET_AVX512F" "vcvtps2ph\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" @@ -26321,7 +26725,7 @@ [(set (match_operand:V16HI 0 "memory_operand" "=m") (unspec:V16HI [(match_operand:V16SF 1 "register_operand" "v") - (match_operand:SI 2 "const_0_to_255_operand" "N")] + (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_VCVTPS2PH))] "TARGET_AVX512F" "vcvtps2ph\t{%2, %1, %0<merge_mask_operand3>|%0<merge_mask_operand3>, %1, %2}" @@ -26379,7 +26783,7 @@ [(unspec:P [(match_operand:P 3 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXSI> 4 "register_operand" "x") - (match_operand:SI 6 "const1248_operand" "n")] + (match_operand:SI 6 "const1248_operand")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) (match_operand:VEC_GATHER_MODE 5 "register_operand" "1")] @@ -26399,7 +26803,7 @@ [(unspec:P [(match_operand:P 2 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXSI> 3 "register_operand" "x") - (match_operand:SI 5 "const1248_operand" "n")] + (match_operand:SI 5 "const1248_operand")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) (match_operand:VEC_GATHER_MODE 4 "register_operand" "1")] @@ -26440,7 +26844,7 @@ [(unspec:P [(match_operand:P 3 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x") - (match_operand:SI 6 "const1248_operand" "n")] + (match_operand:SI 6 "const1248_operand")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")] @@ -26460,7 +26864,7 @@ [(unspec:P [(match_operand:P 2 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "x") - (match_operand:SI 5 "const1248_operand" "n")] + (match_operand:SI 5 "const1248_operand")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) (match_operand:<VEC_GATHER_SRCDI> 4 "register_operand" "1")] @@ -26485,7 +26889,7 @@ [(unspec:P [(match_operand:P 3 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x") - (match_operand:SI 6 "const1248_operand" "n")] + (match_operand:SI 6 "const1248_operand")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")] @@ -26508,7 +26912,7 @@ [(unspec:P [(match_operand:P 2 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXDI> 3 
"register_operand" "x") - (match_operand:SI 5 "const1248_operand" "n")] + (match_operand:SI 5 "const1248_operand")] UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) (match_operand:<VEC_GATHER_SRCDI> 4 "register_operand" "1")] @@ -26550,7 +26954,7 @@ [(unspec:P [(match_operand:P 4 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXSI> 3 "register_operand" "v") - (match_operand:SI 5 "const1248_operand" "n")] + (match_operand:SI 5 "const1248_operand")] UNSPEC_VSIBADDR)])] UNSPEC_GATHER)) (clobber (match_scratch:<avx512fmaskmode> 2 "=&Yk"))] @@ -26571,7 +26975,7 @@ [(unspec:P [(match_operand:P 3 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXSI> 2 "register_operand" "v") - (match_operand:SI 4 "const1248_operand" "n")] + (match_operand:SI 4 "const1248_operand")] UNSPEC_VSIBADDR)])] UNSPEC_GATHER)) (clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))] @@ -26612,7 +27016,7 @@ [(unspec:P [(match_operand:P 4 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "v") - (match_operand:SI 5 "const1248_operand" "n")] + (match_operand:SI 5 "const1248_operand")] UNSPEC_VSIBADDR)])] UNSPEC_GATHER)) (clobber (match_scratch:QI 2 "=&Yk"))] @@ -26633,7 +27037,7 @@ [(unspec:P [(match_operand:P 3 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXDI> 2 "register_operand" "v") - (match_operand:SI 4 "const1248_operand" "n")] + (match_operand:SI 4 "const1248_operand")] UNSPEC_VSIBADDR)])] UNSPEC_GATHER)) (clobber (match_scratch:QI 1 "=&Yk"))] @@ -26678,7 +27082,7 @@ [(unspec:P [(match_operand:P 0 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXSI> 2 "register_operand" "v") - (match_operand:SI 4 "const1248_operand" "n") + (match_operand:SI 4 "const1248_operand") (match_operand:<avx512fmaskmode> 6 "register_operand" "1")] UNSPEC_VSIBADDR)]) (unspec:VI48F @@ -26718,7 +27122,7 @@ [(unspec:P [(match_operand:P 0 "vsib_address_operand" "Tv") (match_operand:<VEC_GATHER_IDXDI> 2 "register_operand" "v") - (match_operand:SI 4 "const1248_operand" "n") + (match_operand:SI 4 "const1248_operand") (match_operand:QI 6 "register_operand" "1")] UNSPEC_VSIBADDR)]) (unspec:VI48F @@ -26939,7 +27343,7 @@ [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VFH_AVX512VL 1 "vector_operand" "vm") - (match_operand 2 "const_0_to_255_operand" "n")] + (match_operand 2 "const_0_to_255_operand")] UNSPEC_FPCLASS))] "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)" "vfpclass<ssemodesuffix><vecmemsuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; @@ -26953,7 +27357,7 @@ (and:<avx512fmaskmode> (unspec:<avx512fmaskmode> [(match_operand:VFH_128 1 "nonimmediate_operand" "vm") - (match_operand 2 "const_0_to_255_operand" "n")] + (match_operand 2 "const_0_to_255_operand")] UNSPEC_FPCLASS) (const_int 1)))] "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)" @@ -27077,7 +27481,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") (match_operand:V4SI 2 "vector_operand" "xBm") - (match_operand:SI 3 "const_0_to_3_operand" "n")] + (match_operand:SI 3 "const_0_to_3_operand")] UNSPEC_SHA1RNDS4))] "TARGET_SHA" "sha1rnds4\t{%3, %2, %0|%0, %2, %3}" @@ -27304,7 +27708,7 @@ [(match_operand:V16SF 1 "register_operand" "0") (match_operand:V64SF 2 "register_operand" "v") (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FMADD) - (match_operand:V16SF 4 "const0_operand" "C") + (match_operand:V16SF 4 "const0_operand") (match_operand:HI 5 "register_operand" "Yk")))] "TARGET_AVX5124FMAPS" 
"v4fmaddps\t{%3, %g2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %g2, %3}" @@ -27345,7 +27749,7 @@ [(match_operand:V4SF 1 "register_operand" "0") (match_operand:V64SF 2 "register_operand" "v") (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FMADD) - (match_operand:V4SF 4 "const0_operand" "C") + (match_operand:V4SF 4 "const0_operand") (match_operand:QI 5 "register_operand" "Yk")))] "TARGET_AVX5124FMAPS" "v4fmaddss\t{%3, %x2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %x2, %3}" @@ -27386,7 +27790,7 @@ [(match_operand:V16SF 1 "register_operand" "0") (match_operand:V64SF 2 "register_operand" "v") (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FNMADD) - (match_operand:V16SF 4 "const0_operand" "C") + (match_operand:V16SF 4 "const0_operand") (match_operand:HI 5 "register_operand" "Yk")))] "TARGET_AVX5124FMAPS" "v4fnmaddps\t{%3, %g2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %g2, %3}" @@ -27427,7 +27831,7 @@ [(match_operand:V4SF 1 "register_operand" "0") (match_operand:V64SF 2 "register_operand" "v") (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FNMADD) - (match_operand:V4SF 4 "const0_operand" "C") + (match_operand:V4SF 4 "const0_operand") (match_operand:QI 5 "register_operand" "Yk")))] "TARGET_AVX5124FMAPS" "v4fnmaddss\t{%3, %x2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %x2, %3}" @@ -27468,7 +27872,7 @@ [(match_operand:V16SI 1 "register_operand" "0") (match_operand:V64SI 2 "register_operand" "v") (match_operand:V4SI 3 "memory_operand" "m")] UNSPEC_VP4DPWSSD) - (match_operand:V16SI 4 "const0_operand" "C") + (match_operand:V16SI 4 "const0_operand") (match_operand:HI 5 "register_operand" "Yk")))] "TARGET_AVX5124VNNIW" "vp4dpwssd\t{%3, %g2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %g2, %3}" @@ -27509,7 +27913,7 @@ [(match_operand:V16SI 1 "register_operand" "0") (match_operand:V64SI 2 "register_operand" "v") (match_operand:V4SI 3 "memory_operand" "m")] UNSPEC_VP4DPWSSDS) - (match_operand:V16SI 4 "const0_operand" "C") + (match_operand:V16SI 4 "const0_operand") (match_operand:HI 5 "register_operand" "Yk")))] "TARGET_AVX5124VNNIW" "vp4dpwssds\t{%3, %g2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %g2, %3}" @@ -27585,7 +27989,7 @@ (unspec:VI1_AVX512F [(match_operand:VI1_AVX512F 1 "register_operand" "0,v") (match_operand:VI1_AVX512F 2 "vector_operand" "xBm,vm") - (match_operand 3 "const_0_to_255_operand" "n,n")] + (match_operand 3 "const_0_to_255_operand")] UNSPEC_GF2P8AFFINEINV))] "TARGET_GFNI" "@ @@ -27602,7 +28006,7 @@ (unspec:VI1_AVX512F [(match_operand:VI1_AVX512F 1 "register_operand" "0,v") (match_operand:VI1_AVX512F 2 "vector_operand" "xBm,vm") - (match_operand 3 "const_0_to_255_operand" "n,n")] + (match_operand 3 "const_0_to_255_operand")] UNSPEC_GF2P8AFFINE))] "TARGET_GFNI" "@ @@ -27635,7 +28039,7 @@ (unspec:VI248_AVX512VL [(match_operand:VI248_AVX512VL 1 "register_operand" "v") (match_operand:VI248_AVX512VL 2 "nonimmediate_operand" "vm") - (match_operand:SI 3 "const_0_to_255_operand" "n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPSHRD))] "TARGET_AVX512VBMI2" "vpshrd<ssemodesuffix>\t{%3, %2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2, %3 }" @@ -27646,7 +28050,7 @@ (unspec:VI248_AVX512VL [(match_operand:VI248_AVX512VL 1 "register_operand" "v") (match_operand:VI248_AVX512VL 2 "nonimmediate_operand" "vm") - (match_operand:SI 3 "const_0_to_255_operand" "n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPSHLD))] "TARGET_AVX512VBMI2" "vpshld<ssemodesuffix>\t{%3, %2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2, %3 }" @@ -27702,7 +28106,7 @@ (match_operand:VI248_AVX512VL 2 "register_operand" "v") 
(match_operand:VI248_AVX512VL 3 "nonimmediate_operand" "vm")] UNSPEC_VPSHRDV) - (match_operand:VI248_AVX512VL 4 "const0_operand" "C") + (match_operand:VI248_AVX512VL 4 "const0_operand") (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] "TARGET_AVX512VBMI2" "vpshrdv<ssemodesuffix>\t{%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %2, %3 }" @@ -27759,7 +28163,7 @@ (match_operand:VI248_AVX512VL 2 "register_operand" "v") (match_operand:VI248_AVX512VL 3 "nonimmediate_operand" "vm")] UNSPEC_VPSHLDV) - (match_operand:VI248_AVX512VL 4 "const0_operand" "C") + (match_operand:VI248_AVX512VL 4 "const0_operand") (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] "TARGET_AVX512VBMI2" "vpshldv<ssemodesuffix>\t{%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %2, %3 }" @@ -27781,8 +28185,12 @@ ||((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI))" { - operands[1] = lowpart_subreg (<VI1SI>mode, operands[1], <MODE>mode); - operands[2] = lowpart_subreg (<VI1SI>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<VI1SI>mode, + force_reg (<MODE>mode, operands[1]), + <MODE>mode); + operands[2] = lowpart_subreg (<VI1SI>mode, + force_reg (<MODE>mode, operands[2]), + <MODE>mode); emit_insn (gen_rtx_SET (operands[0], operands[3])); emit_insn (gen_vpdpbusd_<vi1si> (operands[0], operands[3], operands[1], operands[2])); @@ -27851,7 +28259,7 @@ (match_operand:VI4_AVX512VL 2 "register_operand" "v") (match_operand:VI4_AVX512VL 3 "nonimmediate_operand" "vm") ] UNSPEC_VPMADDUBSWACCD) - (match_operand:VI4_AVX512VL 4 "const0_operand" "C") + (match_operand:VI4_AVX512VL 4 "const0_operand") (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] "TARGET_AVX512VNNI" "vpdpbusd\t{%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %2, %3 }" @@ -27919,7 +28327,7 @@ (match_operand:VI4_AVX512VL 2 "register_operand" "v") (match_operand:VI4_AVX512VL 3 "nonimmediate_operand" "vm")] UNSPEC_VPMADDUBSWACCSSD) - (match_operand:VI4_AVX512VL 4 "const0_operand" "C") + (match_operand:VI4_AVX512VL 4 "const0_operand") (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] "TARGET_AVX512VNNI" "vpdpbusds\t{%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %2, %3 }" @@ -27987,7 +28395,7 @@ (match_operand:VI4_AVX512VL 2 "register_operand" "v") (match_operand:VI4_AVX512VL 3 "nonimmediate_operand" "vm")] UNSPEC_VPMADDWDACCD) - (match_operand:VI4_AVX512VL 4 "const0_operand" "C") + (match_operand:VI4_AVX512VL 4 "const0_operand") (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] "TARGET_AVX512VNNI" "vpdpwssd\t{%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %2, %3 }" @@ -28055,7 +28463,7 @@ (match_operand:VI4_AVX512VL 2 "register_operand" "v") (match_operand:VI4_AVX512VL 3 "nonimmediate_operand" "vm")] UNSPEC_VPMADDWDACCSSD) - (match_operand:VI4_AVX512VL 4 "const0_operand" "C") + (match_operand:VI4_AVX512VL 4 "const0_operand") (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] "TARGET_AVX512VNNI" "vpdpwssds\t{%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %2, %3 }" @@ -28105,7 +28513,7 @@ [(set (match_operand:VI8_FVL 0 "register_operand" "=v") (unspec:VI8_FVL [(match_operand:VI8_FVL 1 "register_operand" "v") (match_operand:VI8_FVL 2 "vector_operand" "vm") - (match_operand:SI 3 "const_0_to_255_operand" "n")] + (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPCLMULQDQ))] "TARGET_VPCLMULQDQ" "vpclmulqdq\t{%3, %2, %1, %0|%0, %1, %2, %3}" @@ -28259,8 +28667,8 @@ ;; KEYLOCKER (define_insn "loadiwkey" - [(unspec_volatile:V2DI [(match_operand:V2DI 0 "register_operand" "v") - (match_operand:V2DI 1 "register_operand" "v") + 
[(unspec_volatile:V2DI [(match_operand:V2DI 0 "register_operand" "x") + (match_operand:V2DI 1 "register_operand" "x") (match_operand:V2DI 2 "register_operand" "Yz") (match_operand:SI 3 "register_operand" "a")] UNSPECV_LOADIWKEY) @@ -28393,7 +28801,7 @@ (UNSPECV_AESENC256KLU8 "enc256kl")]) (define_insn "aes<aesklvariant>u8" - [(set (match_operand:V2DI 0 "register_operand" "=v") + [(set (match_operand:V2DI 0 "register_operand" "=x") (unspec_volatile:V2DI [(match_operand:V2DI 1 "register_operand" "0") (match_operand:BLK 2 "memory_operand" "m")] AESDECENCKL)) diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index bb86f82..0b75882 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -137,7 +137,7 @@ [(set (match_dup 0) (vec_merge:SUBST_V (match_dup 1) - (match_operand:SUBST_V 2 "const0_operand" "C") + (match_operand:SUBST_V 2 "const0_operand") (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk"))) ]) @@ -155,7 +155,7 @@ (vec_merge:SUBST_V (vec_merge:SUBST_V (match_dup 1) - (match_operand:SUBST_V 3 "const0_operand" "C") + (match_operand:SUBST_V 3 "const0_operand") (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")) (match_dup 2) (const_int 1)))]) @@ -171,7 +171,7 @@ [(set (match_dup 0) (vec_merge:SUBST_CV (match_dup 1) - (match_operand:SUBST_CV 2 "const0_operand" "C") + (match_operand:SUBST_CV 2 "const0_operand") (unspec:<avx512fmaskmode> [(match_operand:<avx512fmaskcmode> 3 "register_operand" "Yk")] UNSPEC_COMPLEX_MASK))) @@ -372,7 +372,7 @@ (vec_merge:SUBST_CV (vec_merge:SUBST_CV (match_dup 1) - (match_operand:SUBST_CV 3 "const0_operand" "C") + (match_operand:SUBST_CV 3 "const0_operand") (unspec:<avx512fmaskmode> [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")] UNSPEC_COMPLEX_MASK)) @@ -478,5 +478,5 @@ [(set (match_dup 0) (vec_merge:SUBST_V (match_dup 1) - (match_operand:SUBST_V 2 "const0_operand" "C") + (match_operand:SUBST_V 2 "const0_operand") (match_operand:<avx512fmaskhalfmode> 3 "register_operand" "Yk")))]) diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index 820e9ca..92634d5 100644 --- a/gcc/config/i386/sync.md +++ b/gcc/config/i386/sync.md @@ -745,10 +745,10 @@ [(match_operand:SWI 0 "memory_operand" "+m") (match_operand:SI 3 "const_int_operand")] ;; model UNSPECV_XCHG) - (match_operand:SWI 2 "const_int_operand" "i"))) + (match_operand:SWI 2 "const_int_operand"))) (set (match_dup 0) (plus:SWI (match_dup 0) - (match_operand:SWI 1 "const_int_operand" "i")))] + (match_operand:SWI 1 "const_int_operand")))] "(unsigned HOST_WIDE_INT) INTVAL (operands[1]) == -(unsigned HOST_WIDE_INT) INTVAL (operands[2])" { diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 017ffa6..6c9066c 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1866,7 +1866,7 @@ struct processor_costs skylake_cost = { {8, 8, 8, 12, 24}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ 6, 6, /* SSE->integer and integer->SSE moves */ - 5, 5, /* mask->integer and integer->mask moves */ + 6, 6, /* mask->integer and integer->mask moves */ {8, 8, 8}, /* cost of loading mask register in QImode, HImode, SImode. */ {6, 6, 6}, /* cost if storing mask register @@ -1897,15 +1897,15 @@ struct processor_costs skylake_cost = { 8, /* "large" insn */ 17, /* MOVE_RATIO */ 17, /* CLEAR_RATIO */ - {4, 4, 4}, /* cost of loading integer registers + {6, 6, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ - {6, 6, 6}, /* cost of storing integer registers */ - {6, 6, 6, 10, 20}, /* cost of loading SSE register + {8, 8, 8}, /* cost of storing integer registers */ + {8, 8, 8, 8, 16}, /* cost of loading SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ {8, 8, 8, 8, 16}, /* cost of storing SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ - {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ + {8, 8, 8, 8, 16}, /* cost of unaligned loads. */ {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ @@ -1992,7 +1992,7 @@ struct processor_costs icelake_cost = { {8, 8, 8, 12, 24}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ 6, 6, /* SSE->integer and integer->SSE moves */ - 5, 5, /* mask->integer and integer->mask moves */ + 6, 6, /* mask->integer and integer->mask moves */ {8, 8, 8}, /* cost of loading mask register in QImode, HImode, SImode. */ {6, 6, 6}, /* cost if storing mask register @@ -2023,15 +2023,15 @@ struct processor_costs icelake_cost = { 8, /* "large" insn */ 17, /* MOVE_RATIO */ 17, /* CLEAR_RATIO */ - {4, 4, 4}, /* cost of loading integer registers + {6, 6, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {6, 6, 6}, /* cost of storing integer registers */ - {6, 6, 6, 10, 20}, /* cost of loading SSE register + {8, 8, 8}, /* cost of storing integer registers */ + {8, 8, 8, 8, 16}, /* cost of loading SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ {8, 8, 8, 8, 16}, /* cost of storing SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ - {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ + {8, 8, 8, 8, 16}, /* cost of unaligned loads. */ {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ @@ -2146,13 +2146,13 @@ struct processor_costs alderlake_cost = { {6, 6, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {6, 6, 6}, /* cost of storing integer registers */ - {6, 6, 6, 10, 15}, /* cost of loading SSE register + {8, 8, 8}, /* cost of storing integer registers */ + {8, 8, 8, 10, 15}, /* cost of loading SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ - {6, 6, 6, 10, 15}, /* cost of storing SSE register + {8, 8, 8, 10, 15}, /* cost of storing SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ - {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ - {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ + {8, 8, 8, 10, 15}, /* cost of unaligned loads. */ + {8, 8, 8, 10, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ 18, 6, /* Gather load static, per_elt. */ @@ -3088,6 +3088,121 @@ struct processor_costs intel_cost = { "16", /* Func alignment. */ }; +/* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. 
*/ +static stringop_algs lujiazui_memcpy[2] = { + {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + {libcall, {{12, unrolled_loop, true}, {32, loop, false}, + {6144, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs lujiazui_memset[2] = { + {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + {libcall, {{12, loop, true}, {32, loop, false}, + {640, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static const +struct processor_costs lujiazui_cost = { + { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl. */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers. */ + 2, /* cost of reg,reg fld/fst. */ + {6, 6, 8}, /* cost of loading fp registers + in SFmode, DFmode and XFmode. */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode. */ + 2, /* cost of moving MMX register. */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode. */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + {6, 6, 6, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit. */ + {6, 6, 6, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit. */ + 6, 6, /* SSE->integer and integer->SSE moves. */ + 6, 6, /* mask->integer and integer->mask moves. */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. */ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction. */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction. */ + COSTS_N_INSNS (1), /* variable shift costs. */ + COSTS_N_INSNS (1), /* constant shift costs. */ + {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */ + COSTS_N_INSNS (3), /* HI. */ + COSTS_N_INSNS (3), /* SI. */ + COSTS_N_INSNS (12), /* DI. */ + COSTS_N_INSNS (14)}, /* other. */ + 0, /* cost of multiply per each bit set. */ + {COSTS_N_INSNS (22), /* cost of a divide/mod for QI. */ + COSTS_N_INSNS (24), /* HI. */ + COSTS_N_INSNS (24), /* SI. */ + COSTS_N_INSNS (150), /* DI. */ + COSTS_N_INSNS (152)}, /* other. */ + COSTS_N_INSNS (1), /* cost of movsx. */ + COSTS_N_INSNS (1), /* cost of movzx. */ + 8, /* "large" insn. */ + 17, /* MOVE_RATIO. */ + 6, /* CLEAR_RATIO. */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers. */ + {6, 6, 6, 10, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {6, 6, 6, 10, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit. */ + {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ + {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + 6, /* cost of moving SSE register to integer. */ + 18, 6, /* Gather load static, per_elt. */ + 18, 6, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 4096, /* size of l2 cache. */ + 64, /* size of prefetch block. */ + /* Lujiazui processor never drop prefetches, like AMD processors. */ + 100, /* number of parallel prefetches. */ + 3, /* Branch cost. 
*/ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (22), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (3), /* cost of MULSS instruction. */ + COSTS_N_INSNS (4), /* cost of MULSD instruction. */ + COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */ + 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + lujiazui_memcpy, + lujiazui_memset, + COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ +}; + /* Generic should produce code tuned for Core-i7 (and newer chips) and btver1 (and newer chips). */ diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index e413d04..1ffaeef 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -58,6 +58,7 @@ ix86_issue_rate (void) case PROCESSOR_K8: case PROCESSOR_AMDFAM10: case PROCESSOR_BTVER1: + case PROCESSOR_LUJIAZUI: return 3; case PROCESSOR_BDVER1: @@ -368,6 +369,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_LUJIAZUI: memory = get_attr_memory (insn); /* Show ability of reorder buffer to hide latency of load by executing diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 82ca0ae..540e45d 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -41,8 +41,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* X86_TUNE_SCHEDULE: Enable scheduling. */ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT - | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI + | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming on modern chips. Prefer stores affecting whole integer register @@ -51,8 +51,8 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL - | m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT | m_ALDERLAKE - | m_GENERIC) + | m_KNL | m_KNM | m_AMD_MULTIPLE | m_LUJIAZUI | m_TREMONT + | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store destinations to be 128bit to allow register renaming on 128bit SSE units, @@ -62,7 +62,8 @@ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", that can be partly masked by careful scheduling of moves. 
*/ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 - | m_BDVER | m_ZNVER | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_BDVER | m_ZNVER | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE + | m_GENERIC) /* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids partial write to the destination in scalar SSE conversion from FP @@ -70,14 +71,14 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY, "sse_partial_reg_fp_converts_dependency", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 - | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC) + | m_BDVER | m_ZNVER | m_LUJIAZUI | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial write to the destination in scalar SSE conversion from integer to FP. */ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY, "sse_partial_reg_converts_dependency", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 - | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC) + | m_BDVER | m_ZNVER | m_LUJIAZUI | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before several insns to break false dependency on the dest register for GLC @@ -108,7 +109,7 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", DEF_TUNE (X86_TUNE_MOVX, "movx", m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL - | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE + | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by @@ -116,31 +117,31 @@ DEF_TUNE (X86_TUNE_MOVX, "movx", DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall", m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE - | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent conditional jump instruction for 32 bit TARGET. */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32", - m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC) + m_CORE_ALL | m_BDVER | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent conditional jump instruction for TARGET_64BIT. */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64", m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER - | m_ZNVER | m_GENERIC) + | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a subsequent conditional jump instruction when the condition jump check sign flag (SF) or overflow flag (OF). */ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER - | m_ZNVER | m_GENERIC) + | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional jump instruction when the alu instruction produces the CCFLAG consumed by the conditional jump instruction. 
*/ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_LUJIAZUI | m_GENERIC) /*****************************************************************************/ @@ -157,7 +158,7 @@ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL - | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8) + | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_LUJIAZUI) /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are considered on critical path. */ @@ -171,15 +172,15 @@ DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move", /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", - m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_TREMONT - | m_ALDERLAKE | m_GENERIC) + m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI + | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions. Some chips, like 486 and Pentium works faster with separate load and push instructions. */ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE - | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred over esp subtraction. */ @@ -234,7 +235,7 @@ DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_benefi /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall on 16-bit immediate moves into memory on Core2 and Corei7. */ -DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC) +DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such as "add mem, reg". */ @@ -249,19 +250,20 @@ DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO)) DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec", ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)) + | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI + | m_GENERIC)) /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred for DFmode copies */ DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves", ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT - | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)) + | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI + | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)) /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL - | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL) + | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_LUJIAZUI) /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */ DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr", @@ -294,7 +296,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) move/set sequences of bytes with known size. 
*/ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, "prefer_known_rep_movsb_stosb", - m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512) + m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512 | m_LUJIAZUI) /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of compact prologues and epilogues by issuing a misaligned moves. This @@ -303,15 +305,15 @@ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, FIXME: This may actualy be a win on more targets than listed here. */ DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES, "misaligned_move_string_pro_epilogues", - m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_TREMONT + m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_USE_SAHF: Controls use of SAHF. */ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER - | m_BTVER | m_ZNVER | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT - | m_ALDERLAKE | m_GENERIC) + | m_BTVER | m_ZNVER | m_LUJIAZUI | m_GOLDMONT | m_GOLDMONT_PLUS + | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */ DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", @@ -321,13 +323,14 @@ DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL - | m_LAKEMONT | m_AMD_MULTIPLE | m_GOLDMONT | m_GOLDMONT_PLUS - | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_LAKEMONT | m_AMD_MULTIPLE | m_LUJIAZUI | m_GOLDMONT + | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency for bit-manipulation instructions. */ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", - m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI + | m_GENERIC) /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based on hardware capabilities. Bdver3 hardware has a loop buffer which makes @@ -339,18 +342,19 @@ DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4) if-converted sequence to one. */ DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn", m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence. */ DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence", - m_CORE_ALL | m_BDVER | m_ZNVER | m_TREMONT | m_ALDERLAKE | m_GENERIC) + m_CORE_ALL | m_BDVER | m_ZNVER | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE + | m_GENERIC) /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) - (signed) x >> (W-1)) instead of cmove or SSE max/abs instructions. 
*/ DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs", m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT - | m_GOLDMONT_PLUS) + | m_GOLDMONT_PLUS | m_LUJIAZUI) /*****************************************************************************/ /* 387 instruction selection tuning */ @@ -367,17 +371,17 @@ DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop", DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop", ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE - | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE - | m_GENERIC)) + | m_LUJIAZUI | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT + | m_ALDERLAKE | m_GENERIC)) /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp. */ -DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE) +DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE | m_LUJIAZUI) /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */ DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT - | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_LUJIAZUI + | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC) /*****************************************************************************/ /* SSE instruction selection tuning */ @@ -393,14 +397,14 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill", DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal", m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE - | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_GENERIC) + | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead of a sequence loading registers by parts. */ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal", m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM - | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS - | m_TREMONT | m_ALDERLAKE | m_BDVER | m_ZNVER | m_GENERIC) + | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE + | m_BDVER | m_ZNVER | m_LUJIAZUI | m_GENERIC) /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single precision 128bit instructions instead of double where possible. */ @@ -409,13 +413,14 @@ DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optim /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. */ DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", - m_AMD_MULTIPLE | m_CORE_ALL | m_TREMONT | m_ALDERLAKE | m_GENERIC) + m_AMD_MULTIPLE | m_LUJIAZUI | m_CORE_ALL | m_TREMONT | m_ALDERLAKE + | m_GENERIC) /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to xorps/xorpd and other variants. */ DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER - | m_TREMONT | m_ALDERLAKE | m_GENERIC) + | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC) /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer to SSE registers. 
If disabled, the moves will be done by storing @@ -464,7 +469,18 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_INTEL) -/* X86_TUNE_USE_GATHER: Use gather instructions. */ +/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2 + elements. */ +DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC)) + +/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4 + elements. */ +DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC)) + +/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more + elements. */ DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather", ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_GENERIC)) diff --git a/gcc/config/i386/x86gprintrin.h b/gcc/config/i386/x86gprintrin.h index 017ec29..e0be01d 100644 --- a/gcc/config/i386/x86gprintrin.h +++ b/gcc/config/i386/x86gprintrin.h @@ -24,7 +24,7 @@ #ifndef _X86GPRINTRIN_H_INCLUDED #define _X86GPRINTRIN_H_INCLUDED -#if defined __MMX__ || defined __SSE__ +#if !defined _SOFT_FLOAT || defined __MMX__ || defined __SSE__ #pragma GCC push_options #pragma GCC target("general-regs-only") #define __DISABLE_GENERAL_REGS_ONLY__ diff --git a/gcc/config/ia64/ia64.cc b/gcc/config/ia64/ia64.cc index f9fb681..25e4a47 100644 --- a/gcc/config/ia64/ia64.cc +++ b/gcc/config/ia64/ia64.cc @@ -332,8 +332,8 @@ static fixed_size_mode ia64_get_reg_raw_mode (int regno); static section * ia64_hpux_function_section (tree, enum node_frequency, bool, bool); -static bool ia64_vectorize_vec_perm_const (machine_mode, rtx, rtx, rtx, - const vec_perm_indices &); +static bool ia64_vectorize_vec_perm_const (machine_mode, machine_mode, rtx, + rtx, rtx, const vec_perm_indices &); static unsigned int ia64_hard_regno_nregs (unsigned int, machine_mode); static bool ia64_hard_regno_mode_ok (unsigned int, machine_mode); @@ -11751,9 +11751,13 @@ ia64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ static bool -ia64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) +ia64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) { + if (vmode != op_mode) + return false; + struct expand_vec_perm_d d; unsigned char perm[MAX_VECT_LEN]; unsigned int i, nelt, which; diff --git a/gcc/config/iq2000/iq2000.md b/gcc/config/iq2000/iq2000.md index fdb346f..86361e2 100644 --- a/gcc/config/iq2000/iq2000.md +++ b/gcc/config/iq2000/iq2000.md @@ -165,6 +165,8 @@ (const_string "yes") (const_string "no")))) +;; Is this a bbi instruction or not +(define_attr "bbi" "no,yes" (const_string "no")) ;; Describe a user's asm statement. (define_asm_attributes @@ -183,11 +185,18 @@ (nil) (nil)]) -(define_delay (eq_attr "type" "branch") +;; GAS refuses to assemble bbi[n]l. So for bbi instructions, do not +;; allow them to annul-false. 
+(define_delay (and (eq_attr "type" "branch") (eq_attr "bbi" "no")) [(and (eq_attr "dslot" "ok_in_dslot") (eq_attr "length" "4")) (nil) (and (eq_attr "branch_likely" "yes") (and (eq_attr "dslot" "ok_in_dslot") (eq_attr "length" "4")))]) +(define_delay (and (eq_attr "type" "branch") (eq_attr "bbi" "yes")) + [(and (eq_attr "dslot" "ok_in_dslot") (eq_attr "length" "4")) + (nil) + (nil)]) + (define_delay (eq_attr "type" "call") [(and (eq_attr "dslot" "ok_in_dslot") (eq_attr "length" "4")) (nil) @@ -1114,7 +1123,8 @@ "" "bb%A2\\t%0(31-%1),%P2%P3" [(set_attr "length" "4") - (set_attr "type" "branch")]) + (set_attr "type" "branch") + (set_attr "bbi" "yes")]) (define_insn "" [(set (pc) @@ -1128,7 +1138,8 @@ "" "bb%A3\\t%0(31-%1),%P2%P3" [(set_attr "length" "4") - (set_attr "type" "branch")]) + (set_attr "type" "branch") + (set_attr "bbi" "yes")]) (define_insn "" [(set (pc) @@ -1142,7 +1153,8 @@ "0" "bb%A2\\t%0(31-%1),%P2%P3" [(set_attr "length" "4") - (set_attr "type" "branch")]) + (set_attr "type" "branch") + (set_attr "bbi" "yes")]) (define_insn "" [(set (pc) @@ -1156,7 +1168,8 @@ "0" "bb%A3\\t%0(31-%1),%P2%P3" [(set_attr "length" "4") - (set_attr "type" "branch")]) + (set_attr "type" "branch") + (set_attr "bbi" "yes")]) (define_insn "" [(set (pc) @@ -1169,7 +1182,8 @@ "" "bb%A3\\t%0(%p1),%P2%P3" [(set_attr "length" "4") - (set_attr "type" "branch")]) + (set_attr "type" "branch") + (set_attr "bbi" "yes")]) (define_insn "" [(set (pc) @@ -1182,7 +1196,8 @@ "" "bb%A2\\t%0(%p1),%P2%P3" [(set_attr "length" "4") - (set_attr "type" "branch")]) + (set_attr "type" "branch") + (set_attr "bbi" "yes")]) ;; ;; .................... diff --git a/gcc/config/loongarch/constraints.md b/gcc/config/loongarch/constraints.md new file mode 100644 index 0000000..d0bfddb --- /dev/null +++ b/gcc/config/loongarch/constraints.md @@ -0,0 +1,202 @@ +;; Constraint definitions for LoongArch. +;; Copyright (C) 2021-2022 Free Software Foundation, Inc. +;; Contributed by Loongson Ltd. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +;; Register constraints + +;; "a" "A constant call global and noplt address." +;; "b" <-----unused +;; "c" "A constant call local address." +;; "d" <-----unused +;; "e" JIRL_REGS +;; "f" FP_REGS +;; "g" <-----unused +;; "h" "A constant call plt address." +;; "i" "Matches a general integer constant." (Global non-architectural) +;; "j" SIBCALL_REGS +;; "k" "A memory operand whose address is formed by a base register and +;; (optionally scaled) index register." +;; "l" "A signed 16-bit constant." +;; "m" "A memory operand whose address is formed by a base register and offset +;; that is suitable for use in instructions with the same addressing mode +;; as @code{st.w} and @code{ld.w}." +;; "n" "Matches a non-symbolic integer constant." (Global non-architectural) +;; "o" "Matches an offsettable memory reference." (Global non-architectural) +;; "p" "Matches a general address." 
(Global non-architectural) +;; "q" CSR_REGS +;; "r" GENERAL_REGS (Global non-architectural) +;; "s" "Matches a symbolic integer constant." (Global non-architectural) +;; "t" "A constant call weak address" +;; "u" "A signed 52bit constant and low 32-bit is zero (for logic instructions)" +;; "v" "A signed 64-bit constant and low 44-bit is zero (for logic instructions)." +;; "w" "Matches any valid memory." +;; "x" <-----unused +;; "y" <-----unused +;; "z" FCC_REGS +;; "A" <-----unused +;; "B" <-----unused +;; "C" <-----unused +;; "D" <-----unused +;; "E" "Matches a floating-point constant." (Global non-architectural) +;; "F" "Matches a floating-point constant." (Global non-architectural) +;; "G" "Floating-point zero." +;; "H" <-----unused +;; "I" "A signed 12-bit constant (for arithmetic instructions)." +;; "J" "Integer zero." +;; "K" "An unsigned 12-bit constant (for logic instructions)." +;; "L" <-----unused +;; "M" <-----unused +;; "N" <-----unused +;; "O" <-----unused +;; "P" <-----unused +;; "Q" <-----unused +;; "R" <-----unused +;; "S" <-----unused +;; "T" <-----unused +;; "U" <-----unused +;; "V" "Matches a non-offsettable memory reference." (Global non-architectural) +;; "W" <-----unused +;; "X" "Matches anything." (Global non-architectural) +;; "Y" - +;; "Yd" +;; "A constant @code{move_operand} that can be safely loaded using +;; @code{la}." +;; "Yx" +;; "Z" - +;; "ZC" +;; "A memory operand whose address is formed by a base register and offset +;; that is suitable for use in instructions with the same addressing mode +;; as @code{ll.w} and @code{sc.w}." +;; "ZB" +;; "An address that is held in a general-purpose register. +;; The offset is zero" +;; "<" "Matches a pre-dec or post-dec operand." (Global non-architectural) +;; ">" "Matches a pre-inc or post-inc operand." (Global non-architectural) + +(define_constraint "a" + "@internal + A constant call global and noplt address." + (match_operand 0 "is_const_call_global_noplt_symbol")) + +(define_constraint "c" + "@internal + A constant call local address." + (match_operand 0 "is_const_call_local_symbol")) + +(define_register_constraint "e" "JIRL_REGS" + "@internal") + +(define_register_constraint "f" "TARGET_HARD_FLOAT ? FP_REGS : NO_REGS" + "A floating-point register (if available).") + +(define_constraint "h" + "@internal + A constant call plt address." + (match_operand 0 "is_const_call_plt_symbol")) + +(define_register_constraint "j" "SIBCALL_REGS" + "@internal") + +(define_memory_constraint "k" + "A memory operand whose address is formed by a base register and (optionally scaled) + index register." + (and (match_code "mem") + (match_test "loongarch_base_index_address_p (XEXP (op, 0), mode)"))) + +(define_constraint "l" +"A signed 16-bit constant." +(and (match_code "const_int") + (match_test "IMM16_OPERAND (ival)"))) + +(define_memory_constraint "m" + "A memory operand whose address is formed by a base register and offset + that is suitable for use in instructions with the same addressing mode + as @code{st.w} and @code{ld.w}." + (and (match_code "mem") + (match_test "loongarch_12bit_offset_address_p (XEXP (op, 0), mode)"))) + +(define_register_constraint "q" "CSR_REGS" + "A general-purpose register except for $r0 and $r1 for lcsr.") + +(define_constraint "t" + "@internal + A constant call weak address." + (match_operand 0 "is_const_call_weak_symbol")) + +(define_constraint "u" + "A signed 52bit constant and low 32-bit is zero (for logic instructions)." 
+ (and (match_code "const_int") + (match_test "LU32I_OPERAND (ival)"))) + +(define_constraint "v" + "A signed 64-bit constant and low 44-bit is zero (for logic instructions)." + (and (match_code "const_int") + (match_test "LU52I_OPERAND (ival)"))) + +(define_register_constraint "z" "FCC_REGS" + "A floating-point condition code register.") + +;; Floating-point constraints + +(define_constraint "G" + "Floating-point zero." + (and (match_code "const_double") + (match_test "op == CONST0_RTX (mode)"))) + +;; Integer constraints + +(define_constraint "I" + "A signed 12-bit constant (for arithmetic instructions)." + (and (match_code "const_int") + (match_test "IMM12_OPERAND (ival)"))) + +(define_constraint "J" + "Integer zero." + (and (match_code "const_int") + (match_test "ival == 0"))) + +(define_constraint "K" + "An unsigned 12-bit constant (for logic instructions)." + (and (match_code "const_int") + (match_test "IMM12_OPERAND_UNSIGNED (ival)"))) + +(define_constraint "Yd" + "@internal + A constant @code{move_operand} that can be safely loaded using + @code{la}." + (and (match_operand 0 "move_operand") + (match_test "CONSTANT_P (op)"))) + +(define_constraint "Yx" + "@internal" + (match_operand 0 "low_bitmask_operand")) + +(define_memory_constraint "ZC" + "A memory operand whose address is formed by a base register and offset + that is suitable for use in instructions with the same addressing mode + as @code{ll.w} and @code{sc.w}." + (and (match_code "mem") + (match_test "loongarch_14bit_shifted_offset_address_p (XEXP (op, 0), mode)"))) + +(define_memory_constraint "ZB" + "@internal + An address that is held in a general-purpose register. + The offset is zero" + (and (match_code "mem") + (match_test "REG_P (XEXP (op, 0))"))) diff --git a/gcc/config/loongarch/generic.md b/gcc/config/loongarch/generic.md new file mode 100644 index 0000000..6e5a093 --- /dev/null +++ b/gcc/config/loongarch/generic.md @@ -0,0 +1,118 @@ +;; Generic DFA-based pipeline description for LoongArch targets +;; Copyright (C) 2021-2022 Free Software Foundation, Inc. +;; Contributed by Loongson Ltd. +;; Based on MIPS target for GNU compiler. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +(define_automaton "alu,imuldiv") + +(define_cpu_unit "alu" "alu") +(define_cpu_unit "imuldiv" "imuldiv") + +;; Ghost instructions produce no real code. +;; They exist purely to express an effect on dataflow. 
+(define_insn_reservation "ghost" 0 + (eq_attr "type" "ghost") + "nothing") + +(define_insn_reservation "generic_alu" 1 + (eq_attr "type" "unknown,prefetch,prefetchx,condmove,const,arith, + shift,slt,clz,trap,multi,nop,logical,signext,move") + "alu") + +(define_insn_reservation "generic_load" 3 + (eq_attr "type" "load,fpload,fpidxload") + "alu") + +(define_insn_reservation "generic_store" 1 + (eq_attr "type" "store,fpstore,fpidxstore") + "alu") + +(define_insn_reservation "generic_xfer" 2 + (eq_attr "type" "mftg,mgtf") + "alu") + +(define_insn_reservation "generic_branch" 1 + (eq_attr "type" "branch,jump,call") + "alu") + +(define_insn_reservation "generic_imul" 17 + (eq_attr "type" "imul") + "imuldiv*17") + +(define_insn_reservation "generic_fcvt" 1 + (eq_attr "type" "fcvt") + "alu") + +(define_insn_reservation "generic_fmove" 2 + (eq_attr "type" "fabs,fneg,fmove") + "alu") + +(define_insn_reservation "generic_fcmp" 3 + (eq_attr "type" "fcmp") + "alu") + +(define_insn_reservation "generic_fadd" 4 + (eq_attr "type" "fadd") + "alu") + +(define_insn_reservation "generic_fmul_single" 7 + (and (eq_attr "type" "fmul,fmadd") + (eq_attr "mode" "SF")) + "alu") + +(define_insn_reservation "generic_fmul_double" 8 + (and (eq_attr "type" "fmul,fmadd") + (eq_attr "mode" "DF")) + "alu") + +(define_insn_reservation "generic_fdiv_single" 23 + (and (eq_attr "type" "fdiv,frdiv") + (eq_attr "mode" "SF")) + "alu") + +(define_insn_reservation "generic_fdiv_double" 36 + (and (eq_attr "type" "fdiv,frdiv") + (eq_attr "mode" "DF")) + "alu") + +(define_insn_reservation "generic_fsqrt_single" 54 + (and (eq_attr "type" "fsqrt,frsqrt") + (eq_attr "mode" "SF")) + "alu") + +(define_insn_reservation "generic_fsqrt_double" 112 + (and (eq_attr "type" "fsqrt,frsqrt") + (eq_attr "mode" "DF")) + "alu") + +(define_insn_reservation "generic_atomic" 10 + (eq_attr "type" "atomic") + "alu") + +;; Sync loop consists of (in order) +;; (1) optional sync, +;; (2) LL instruction, +;; (3) branch and 1-2 ALU instructions, +;; (4) SC instruction, +;; (5) branch and ALU instruction. +;; The net result of this reservation is a big delay with a flush of +;; ALU pipeline. +(define_insn_reservation "generic_sync_loop" 40 + (eq_attr "type" "syncloop") + "alu*39") diff --git a/gcc/config/loongarch/genopts/genstr.sh b/gcc/config/loongarch/genopts/genstr.sh new file mode 100755 index 0000000..972ef12 --- /dev/null +++ b/gcc/config/loongarch/genopts/genstr.sh @@ -0,0 +1,104 @@ +#!/bin/sh +# A simple script that generates loongarch-str.h and loongarch.opt +# from genopt/loongarch-optstr. +# +# Copyright (C) 2021-2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 3, or (at your option) any later +# version. +# +# GCC is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +# License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. + +cd "$(dirname "$0")" + +# Generate a header containing definitions from the string table. +gen_defines() { + cat <<EOF +/* Generated automatically by "genstr" from "loongarch-strings". + Please do not edit this file directly. 
+ + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef LOONGARCH_STR_H +#define LOONGARCH_STR_H +EOF + + sed -e '/^$/n' -e 's@#.*$@@' -e '/^$/d' \ + -e 's@^\([^ \t]\+\)[ \t]*\([^ \t]*\)@#define \1 "\2"@' \ + loongarch-strings + + echo + echo "#endif /* LOONGARCH_STR_H */" +} + + +# Substitute all "@@<KEY>@@" to "<VALUE>" in loongarch.opt.in +# according to the key-value pairs defined in loongarch-strings. + +gen_options() { + + sed -e '/^$/n' -e 's@#.*$@@' -e '/^$/d' \ + -e 's@^\([^ \t]\+\)[ \t]*\([^ \t]*\)@\1="\2"@' \ + loongarch-strings | { \ + + # read the definitions + while read -r line; do + eval "$line" + done + + # print a header + cat << EOF +; Generated by "genstr" from the template "loongarch.opt.in" +; and definitions from "loongarch-strings". +; +; Please do not edit this file directly. +; It will be automatically updated during a gcc build +; if you change "loongarch.opt.in" or "loongarch-strings". +; +EOF + + # make the substitutions + sed -e 's@"@\\"@g' -e 's/@@\([^@]\+\)@@/${\1}/g' loongarch.opt.in | \ + while read -r line; do + eval "echo \"$line\"" + done + } +} + +main() { + case "$1" in + header) gen_defines;; + opt) gen_options;; + *) echo "Unknown Command: \"$1\". Available: header, opt"; exit 1;; + esac +} + +main "$@" diff --git a/gcc/config/loongarch/genopts/loongarch-strings b/gcc/config/loongarch/genopts/loongarch-strings new file mode 100644 index 0000000..cb88ed5 --- /dev/null +++ b/gcc/config/loongarch/genopts/loongarch-strings @@ -0,0 +1,58 @@ +# Defines the key strings for LoongArch compiler options. +# +# Copyright (C) 2021-2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 3, or (at your option) any later +# version. +# +# GCC is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +# License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. 
+ +# -march= / -mtune= +OPTSTR_ARCH arch +OPTSTR_TUNE tune + +STR_CPU_NATIVE native +STR_CPU_LOONGARCH64 loongarch64 +STR_CPU_LA464 la464 + +# Base architecture +STR_ISA_BASE_LA64V100 la64 + +# -mfpu +OPTSTR_ISA_EXT_FPU fpu +STR_ISA_EXT_NOFPU none +STR_ISA_EXT_FPU0 0 +STR_ISA_EXT_FPU32 32 +STR_ISA_EXT_FPU64 64 + +OPTSTR_SOFT_FLOAT soft-float +OPTSTR_SINGLE_FLOAT single-float +OPTSTR_DOUBLE_FLOAT double-float + +# -mabi= +OPTSTR_ABI_BASE abi +STR_ABI_BASE_LP64D lp64d +STR_ABI_BASE_LP64F lp64f +STR_ABI_BASE_LP64S lp64s + +# ABI extension types +STR_ABI_EXT_BASE base + +# -mcmodel= +OPTSTR_CMODEL cmodel +STR_CMODEL_NORMAL normal +STR_CMODEL_TINY tiny +STR_CMODEL_TS tiny-static +STR_CMODEL_LARGE large +STR_CMODEL_EXTREME extreme diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in b/gcc/config/loongarch/genopts/loongarch.opt.in new file mode 100644 index 0000000..61e7d72 --- /dev/null +++ b/gcc/config/loongarch/genopts/loongarch.opt.in @@ -0,0 +1,179 @@ +; Copyright (C) 2021-2022 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT +; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +; License for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; <http://www.gnu.org/licenses/>. +; + +; Variables (macros) that should be exported by loongarch.opt: +; la_opt_switches, +; la_opt_abi_base, la_opt_abi_ext, +; la_opt_cpu_arch, la_opt_cpu_tune, +; la_opt_fpu, +; la_cmodel. + +HeaderInclude +config/loongarch/loongarch-opts.h + +HeaderInclude +config/loongarch/loongarch-str.h + +Variable +HOST_WIDE_INT la_opt_switches = 0 + +; ISA related options +;; Base ISA +Enum +Name(isa_base) Type(int) +Basic ISAs of LoongArch: + +EnumValue +Enum(isa_base) String(@@STR_ISA_BASE_LA64V100@@) Value(ISA_BASE_LA64V100) + + +;; ISA extensions / adjustments +Enum +Name(isa_ext_fpu) Type(int) +FPU types of LoongArch: + +EnumValue +Enum(isa_ext_fpu) String(@@STR_ISA_EXT_NOFPU@@) Value(ISA_EXT_NOFPU) + +EnumValue +Enum(isa_ext_fpu) String(@@STR_ISA_EXT_FPU32@@) Value(ISA_EXT_FPU32) + +EnumValue +Enum(isa_ext_fpu) String(@@STR_ISA_EXT_FPU64@@) Value(ISA_EXT_FPU64) + +m@@OPTSTR_ISA_EXT_FPU@@= +Target RejectNegative Joined ToLower Enum(isa_ext_fpu) Var(la_opt_fpu) Init(M_OPTION_NOT_SEEN) +-m@@OPTSTR_ISA_EXT_FPU@@=FPU Generate code for the given FPU. + +m@@OPTSTR_ISA_EXT_FPU@@=@@STR_ISA_EXT_FPU0@@ +Target RejectNegative Alias(m@@OPTSTR_ISA_EXT_FPU@@=,@@STR_ISA_EXT_NOFPU@@) + +m@@OPTSTR_SOFT_FLOAT@@ +Target Driver RejectNegative Var(la_opt_switches) Mask(FORCE_SOFTF) Negative(m@@OPTSTR_SINGLE_FLOAT@@) +Prevent the use of all hardware floating-point instructions. + +m@@OPTSTR_SINGLE_FLOAT@@ +Target Driver RejectNegative Var(la_opt_switches) Mask(FORCE_F32) Negative(m@@OPTSTR_DOUBLE_FLOAT@@) +Restrict the use of hardware floating-point instructions to 32-bit operations. + +m@@OPTSTR_DOUBLE_FLOAT@@ +Target Driver RejectNegative Var(la_opt_switches) Mask(FORCE_F64) Negative(m@@OPTSTR_SOFT_FLOAT@@) +Allow hardware floating-point instructions to cover both 32-bit and 64-bit operations. 
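As a reading aid for the two new files above: the string table in loongarch-strings is only a key/value list, and genstr.sh turns it into two artifacts. Its "header" mode runs the sed rule shown earlier over loongarch-strings and emits loongarch-str.h, roughly the following C header (a reconstruction of the generated output with the license banner omitted; this is generated, not checked-in, source):

/* Generated automatically by "genstr" from "loongarch-strings".  */
#ifndef LOONGARCH_STR_H
#define LOONGARCH_STR_H

#define OPTSTR_ARCH "arch"
#define OPTSTR_TUNE "tune"
#define STR_CPU_NATIVE "native"
#define STR_CPU_LOONGARCH64 "loongarch64"
#define STR_CPU_LA464 "la464"
#define OPTSTR_ISA_EXT_FPU "fpu"
#define STR_ISA_EXT_FPU64 "64"
#define OPTSTR_ABI_BASE "abi"
#define STR_ABI_BASE_LP64D "lp64d"
#define OPTSTR_CMODEL "cmodel"

#endif /* LOONGARCH_STR_H */

Its "opt" mode substitutes the same values for the @@...@@ placeholders in loongarch.opt.in, so a stanza such as m@@OPTSTR_ISA_EXT_FPU@@= above comes out as mfpu= in the generated loongarch.opt.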
+ + +;; Base target models (implies ISA & tune parameters) +Enum +Name(cpu_type) Type(int) +LoongArch CPU types: + +EnumValue +Enum(cpu_type) String(@@STR_CPU_NATIVE@@) Value(CPU_NATIVE) + +EnumValue +Enum(cpu_type) String(@@STR_CPU_LOONGARCH64@@) Value(CPU_LOONGARCH64) + +EnumValue +Enum(cpu_type) String(@@STR_CPU_LA464@@) Value(CPU_LA464) + +m@@OPTSTR_ARCH@@= +Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_arch) Init(M_OPTION_NOT_SEEN) +-m@@OPTSTR_ARCH@@=PROCESSOR Generate code for the given PROCESSOR ISA. + +m@@OPTSTR_TUNE@@= +Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_tune) Init(M_OPTION_NOT_SEEN) +-m@@OPTSTR_TUNE@@=PROCESSOR Generate optimized code for PROCESSOR. + + +; ABI related options +; (ISA constraints on ABI are handled dynamically) + +;; Base ABI +Enum +Name(abi_base) Type(int) +Base ABI types for LoongArch: + +EnumValue +Enum(abi_base) String(@@STR_ABI_BASE_LP64D@@) Value(ABI_BASE_LP64D) + +EnumValue +Enum(abi_base) String(@@STR_ABI_BASE_LP64F@@) Value(ABI_BASE_LP64F) + +EnumValue +Enum(abi_base) String(@@STR_ABI_BASE_LP64S@@) Value(ABI_BASE_LP64S) + +m@@OPTSTR_ABI_BASE@@= +Target RejectNegative Joined ToLower Enum(abi_base) Var(la_opt_abi_base) Init(M_OPTION_NOT_SEEN) +-m@@OPTSTR_ABI_BASE@@=BASEABI Generate code that conforms to the given BASEABI. + +;; ABI Extension +Variable +int la_opt_abi_ext = M_OPTION_NOT_SEEN + + +mbranch-cost= +Target RejectNegative Joined UInteger Var(loongarch_branch_cost) +-mbranch-cost=COST Set the cost of branches to roughly COST instructions. + +mcheck-zero-division +Target Mask(CHECK_ZERO_DIV) +Trap on integer divide by zero. + +mcond-move-int +Target Var(TARGET_COND_MOVE_INT) Init(1) +Conditional moves for integral are enabled. + +mcond-move-float +Target Var(TARGET_COND_MOVE_FLOAT) Init(1) +Conditional moves for float are enabled. + +mmemcpy +Target Mask(MEMCPY) +Prevent optimizing block moves, which is also the default behavior of -Os. + +mstrict-align +Target Var(TARGET_STRICT_ALIGN) Init(0) +Do not generate unaligned memory accesses. + +mmax-inline-memcpy-size= +Target Joined RejectNegative UInteger Var(loongarch_max_inline_memcpy_size) Init(1024) +-mmax-inline-memcpy-size=SIZE Set the max size of memcpy to inline, default is 1024. + +; The code model option names for -mcmodel. +Enum +Name(cmodel) Type(int) +The code model option names for -mcmodel: + +EnumValue +Enum(cmodel) String(@@STR_CMODEL_NORMAL@@) Value(CMODEL_NORMAL) + +EnumValue +Enum(cmodel) String(@@STR_CMODEL_TINY@@) Value(CMODEL_TINY) + +EnumValue +Enum(cmodel) String(@@STR_CMODEL_TS@@) Value(CMODEL_TINY_STATIC) + +EnumValue +Enum(cmodel) String(@@STR_CMODEL_LARGE@@) Value(CMODEL_LARGE) + +EnumValue +Enum(cmodel) String(@@STR_CMODEL_EXTREME@@) Value(CMODEL_EXTREME) + +mcmodel= +Target RejectNegative Joined Enum(cmodel) Var(la_opt_cmodel) Init(CMODEL_NORMAL) +Specify the code model. diff --git a/gcc/config/loongarch/gnu-user.h b/gcc/config/loongarch/gnu-user.h new file mode 100644 index 0000000..664dc92 --- /dev/null +++ b/gcc/config/loongarch/gnu-user.h @@ -0,0 +1,80 @@ +/* Definitions for LoongArch systems using GNU (glibc-based) userspace, + or other userspace with libc derived from glibc. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
+ +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* Define the size of the wide character type. */ +#undef WCHAR_TYPE +#define WCHAR_TYPE "int" + +#undef WCHAR_TYPE_SIZE +#define WCHAR_TYPE_SIZE 32 + + +/* GNU-specific SPEC definitions. */ +#define GNU_USER_LINK_EMULATION "elf" ABI_GRLEN_SPEC "loongarch" + +#undef GLIBC_DYNAMIC_LINKER +#define GLIBC_DYNAMIC_LINKER \ + "/lib" ABI_GRLEN_SPEC "/ld-linux-loongarch-" ABI_SPEC ".so.1" + +#undef MUSL_DYNAMIC_LINKER +#define MUSL_DYNAMIC_LINKER \ + "/lib" ABI_GRLEN_SPEC "/ld-musl-loongarch-" ABI_SPEC ".so.1" + +#undef GNU_USER_TARGET_LINK_SPEC +#define GNU_USER_TARGET_LINK_SPEC \ + "%{G*} %{shared} -m " GNU_USER_LINK_EMULATION \ + "%{!shared: %{static} %{!static: %{rdynamic:-export-dynamic} " \ + "-dynamic-linker " GNU_USER_DYNAMIC_LINKER "}}" + + +/* Similar to standard Linux, but adding -ffast-math support. */ +#undef GNU_USER_TARGET_MATHFILE_SPEC +#define GNU_USER_TARGET_MATHFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" + +#undef LIB_SPEC +#define LIB_SPEC GNU_USER_TARGET_LIB_SPEC + +#undef LINK_SPEC +#define LINK_SPEC GNU_USER_TARGET_LINK_SPEC + +#undef ENDFILE_SPEC +#define ENDFILE_SPEC \ + GNU_USER_TARGET_MATHFILE_SPEC " " \ + GNU_USER_TARGET_ENDFILE_SPEC + +#undef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{pthread:-D_REENTRANT}" + +/* A standard GNU/Linux mapping. On most targets, it is included in + CC1_SPEC itself by config/linux.h, but loongarch.h overrides CC1_SPEC + and provides this hook instead. */ +#undef SUBTARGET_CC1_SPEC +#define SUBTARGET_CC1_SPEC GNU_USER_TARGET_CC1_SPEC + +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + GNU_USER_TARGET_OS_CPP_BUILTINS (); \ + /* The GNU C++ standard library requires this. */ \ + if (c_dialect_cxx ()) \ + builtin_define ("_GNU_SOURCE"); \ + } \ + while (0) diff --git a/gcc/config/loongarch/la464.md b/gcc/config/loongarch/la464.md new file mode 100644 index 0000000..0ae1776 --- /dev/null +++ b/gcc/config/loongarch/la464.md @@ -0,0 +1,132 @@ +;; Pipeline model for LoongArch LA464 cores. + +;; Copyright (C) 2021-2022 Free Software Foundation, Inc. +;; Contributed by Loongson Ltd. + +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +;; Uncomment the following line to output automata for debugging. +;; (automata_option "v") + +;; Automaton for integer instructions. +(define_automaton "la464_a_alu") + +;; Automaton for floating-point instructions. +(define_automaton "la464_a_falu") + +;; Automaton for memory operations. +(define_automaton "la464_a_mem") + +;; Describe the resources. 
+ +(define_cpu_unit "la464_alu1" "la464_a_alu") +(define_cpu_unit "la464_alu2" "la464_a_alu") +(define_cpu_unit "la464_mem1" "la464_a_mem") +(define_cpu_unit "la464_mem2" "la464_a_mem") +(define_cpu_unit "la464_falu1" "la464_a_falu") +(define_cpu_unit "la464_falu2" "la464_a_falu") + +;; Describe instruction reservations. + +(define_insn_reservation "la464_arith" 1 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "arith,clz,const,logical, + move,nop,shift,signext,slt")) + "la464_alu1 | la464_alu2") + +(define_insn_reservation "la464_branch" 1 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "branch,jump,call,condmove,trap")) + "la464_alu1 | la464_alu2") + +(define_insn_reservation "la464_imul" 7 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "imul")) + "la464_alu1 | la464_alu2") + +(define_insn_reservation "la464_idiv_si" 12 + (and (match_test "TARGET_TUNE_LA464") + (and (eq_attr "type" "idiv") + (eq_attr "mode" "SI"))) + "la464_alu1 | la464_alu2") + +(define_insn_reservation "la464_idiv_di" 25 + (and (match_test "TARGET_TUNE_LA464") + (and (eq_attr "type" "idiv") + (eq_attr "mode" "DI"))) + "la464_alu1 | la464_alu2") + +(define_insn_reservation "la464_load" 4 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "load")) + "la464_mem1 | la464_mem2") + +(define_insn_reservation "la464_gpr_fp" 16 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "mftg,mgtf")) + "la464_mem1") + +(define_insn_reservation "la464_fpload" 4 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "fpload")) + "la464_mem1 | la464_mem2") + +(define_insn_reservation "la464_prefetch" 0 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "prefetch,prefetchx")) + "la464_mem1 | la464_mem2") + +(define_insn_reservation "la464_store" 0 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "store,fpstore,fpidxstore")) + "la464_mem1 | la464_mem2") + +(define_insn_reservation "la464_fadd" 4 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "fadd,fmul,fmadd")) + "la464_falu1 | la464_falu2") + +(define_insn_reservation "la464_fcmp" 2 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "fabs,fcmp,fmove,fneg")) + "la464_falu1 | la464_falu2") + +(define_insn_reservation "la464_fcvt" 4 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "fcvt")) + "la464_falu1 | la464_falu2") + +(define_insn_reservation "la464_fdiv_sf" 12 + (and (match_test "TARGET_TUNE_LA464") + (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt") + (eq_attr "mode" "SF"))) + "la464_falu1 | la464_falu2") + +(define_insn_reservation "la464_fdiv_df" 19 + (and (match_test "TARGET_TUNE_LA464") + (and (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt") + (eq_attr "mode" "DF"))) + "la464_falu1 | la464_falu2") + +;; Force single-dispatch for unknown or multi. +(define_insn_reservation "la464_unknown" 1 + (and (match_test "TARGET_TUNE_LA464") + (eq_attr "type" "unknown,multi,atomic,syncloop")) + "la464_alu1 + la464_alu2 + la464_falu1 + + la464_falu2 + la464_mem1 + la464_mem2") + +;; End of DFA-based pipeline description for la464 diff --git a/gcc/config/loongarch/larchintrin.h b/gcc/config/loongarch/larchintrin.h new file mode 100644 index 0000000..2833f14 --- /dev/null +++ b/gcc/config/loongarch/larchintrin.h @@ -0,0 +1,355 @@ +/* Intrinsics for LoongArch BASE operations. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. 
+ +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published +by the Free Software Foundation; either version 3, or (at your +option) any later version. + +GCC is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#ifndef _GCC_LOONGARCH_BASE_INTRIN_H +#define _GCC_LOONGARCH_BASE_INTRIN_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct drdtime +{ + unsigned long dvalue; + unsigned long dtimeid; +} __drdtime_t; + +typedef struct rdtime +{ + unsigned int value; + unsigned int timeid; +} __rdtime_t; + +#ifdef __loongarch64 +extern __inline __drdtime_t +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__rdtime_d (void) +{ + __drdtime_t __drdtime; + __asm__ volatile ( + "rdtime.d\t%[val],%[tid]\n\t" + : [val]"=&r"(__drdtime.dvalue),[tid]"=&r"(__drdtime.dtimeid) + :); + return __drdtime; +} +#endif + +extern __inline __rdtime_t +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__rdtimeh_w (void) +{ + __rdtime_t __rdtime; + __asm__ volatile ( + "rdtimeh.w\t%[val],%[tid]\n\t" + : [val]"=&r"(__rdtime.value),[tid]"=&r"(__rdtime.timeid) + :); + return __rdtime; +} + +extern __inline __rdtime_t +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__rdtimel_w (void) +{ + __rdtime_t __rdtime; + __asm__ volatile ( + "rdtimel.w\t%[val],%[tid]\n\t" + : [val]"=&r"(__rdtime.value),[tid]"=&r"(__rdtime.timeid) + :); + return __rdtime; +} + +/* Assembly instruction format: rj, fcsr. */ +/* Data types in instruction templates: USI, UQI. */ +#define __movfcsr2gr(/*ui5*/ _1) __builtin_loongarch_movfcsr2gr ((_1)); + +/* Assembly instruction format: fcsr, rj. */ +/* Data types in instruction templates: VOID, UQI, USI. */ +#define __movgr2fcsr(/*ui5*/ _1, _2) \ + __builtin_loongarch_movgr2fcsr ((_1), (unsigned int) _2); + +#if defined __loongarch64 +/* Assembly instruction format: ui5, rj, si12. */ +/* Data types in instruction templates: VOID, USI, UDI, SI. */ +#define __cacop_d(/*ui5*/ _1, /*unsigned long int*/ _2, /*si12*/ _3) \ + ((void) __builtin_loongarch_cacop_d ((_1), (unsigned long int) (_2), (_3))) +#else +#error "Unsupported ABI." +#endif + +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: USI, USI. */ +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__cpucfg (unsigned int _1) +{ + return (unsigned int) __builtin_loongarch_cpucfg ((unsigned int) _1); +} + +#ifdef __loongarch64 +/* Assembly instruction format: rj, rk. */ +/* Data types in instruction templates: DI, DI. */ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__asrtle_d (long int _1, long int _2) +{ + __builtin_loongarch_asrtle_d ((long int) _1, (long int) _2); +} + +/* Assembly instruction format: rj, rk. */ +/* Data types in instruction templates: DI, DI. 
*/ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__asrtgt_d (long int _1, long int _2) +{ + __builtin_loongarch_asrtgt_d ((long int) _1, (long int) _2); +} +#endif + +#if defined __loongarch64 +/* Assembly instruction format: rd, rj, ui5. */ +/* Data types in instruction templates: DI, DI, UQI. */ +#define __lddir_d(/*long int*/ _1, /*ui5*/ _2) \ + ((long int) __builtin_loongarch_lddir_d ((long int) (_1), (_2))) +#else +#error "Unsupported ABI." +#endif + +#if defined __loongarch64 +/* Assembly instruction format: rj, ui5. */ +/* Data types in instruction templates: VOID, DI, UQI. */ +#define __ldpte_d(/*long int*/ _1, /*ui5*/ _2) \ + ((void) __builtin_loongarch_ldpte_d ((long int) (_1), (_2))) +#else +#error "Unsupported ABI." +#endif + +/* Assembly instruction format: rd, rj, rk. */ +/* Data types in instruction templates: SI, QI, SI. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__crc_w_b_w (char _1, int _2) +{ + return (int) __builtin_loongarch_crc_w_b_w ((char) _1, (int) _2); +} + +/* Assembly instruction format: rd, rj, rk. */ +/* Data types in instruction templates: SI, HI, SI. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__crc_w_h_w (short _1, int _2) +{ + return (int) __builtin_loongarch_crc_w_h_w ((short) _1, (int) _2); +} + +/* Assembly instruction format: rd, rj, rk. */ +/* Data types in instruction templates: SI, SI, SI. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__crc_w_w_w (int _1, int _2) +{ + return (int) __builtin_loongarch_crc_w_w_w ((int) _1, (int) _2); +} + +#ifdef __loongarch64 +/* Assembly instruction format: rd, rj, rk. */ +/* Data types in instruction templates: SI, DI, SI. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__crc_w_d_w (long int _1, int _2) +{ + return (int) __builtin_loongarch_crc_w_d_w ((long int) _1, (int) _2); +} +#endif + +/* Assembly instruction format: rd, rj, rk. */ +/* Data types in instruction templates: SI, QI, SI. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__crcc_w_b_w (char _1, int _2) +{ + return (int) __builtin_loongarch_crcc_w_b_w ((char) _1, (int) _2); +} + +/* Assembly instruction format: rd, rj, rk. */ +/* Data types in instruction templates: SI, HI, SI. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__crcc_w_h_w (short _1, int _2) +{ + return (int) __builtin_loongarch_crcc_w_h_w ((short) _1, (int) _2); +} + +/* Assembly instruction format: rd, rj, rk. */ +/* Data types in instruction templates: SI, SI, SI. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__crcc_w_w_w (int _1, int _2) +{ + return (int) __builtin_loongarch_crcc_w_w_w ((int) _1, (int) _2); +} + +#ifdef __loongarch64 +/* Assembly instruction format: rd, rj, rk. */ +/* Data types in instruction templates: SI, DI, SI. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__crcc_w_d_w (long int _1, int _2) +{ + return (int) __builtin_loongarch_crcc_w_d_w ((long int) _1, (int) _2); +} +#endif + +/* Assembly instruction format: rd, ui14. */ +/* Data types in instruction templates: USI, USI. */ +#define __csrrd_w(/*ui14*/ _1) \ + ((unsigned int) __builtin_loongarch_csrrd_w ((_1))) + +/* Assembly instruction format: rd, ui14. 
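A hedged sketch of how the CRC intrinsics above compose; the word-at-a-time loop, the all-ones seed and the final inversion are conventional CRC-32C usage rather than something the header mandates, and the first operand is taken to be the new data word with the second the running value.

/* Hypothetical example: CRC-32C over a word-aligned buffer.  */
static unsigned int
example_crc32c (const int *buf, unsigned long nwords)
{
  int crc = -1;                        /* seed with all ones */
  for (unsigned long i = 0; i < nwords; i++)
    crc = __crcc_w_w_w (buf[i], crc);  /* fold in one 32-bit word */
  return ~(unsigned int) crc;          /* conventional final inversion */
}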
*/ +/* Data types in instruction templates: USI, USI, USI. */ +#define __csrwr_w(/*unsigned int*/ _1, /*ui14*/ _2) \ + ((unsigned int) __builtin_loongarch_csrwr_w ((unsigned int) (_1), (_2))) + +/* Assembly instruction format: rd, rj, ui14. */ +/* Data types in instruction templates: USI, USI, USI, USI. */ +#define __csrxchg_w(/*unsigned int*/ _1, /*unsigned int*/ _2, /*ui14*/ _3) \ + ((unsigned int) __builtin_loongarch_csrxchg_w ((unsigned int) (_1), \ + (unsigned int) (_2), (_3))) + +#ifdef __loongarch64 +/* Assembly instruction format: rd, ui14. */ +/* Data types in instruction templates: UDI, USI. */ +#define __csrrd_d(/*ui14*/ _1) \ + ((unsigned long int) __builtin_loongarch_csrrd_d ((_1))) + +/* Assembly instruction format: rd, ui14. */ +/* Data types in instruction templates: UDI, UDI, USI. */ +#define __csrwr_d(/*unsigned long int*/ _1, /*ui14*/ _2) \ + ((unsigned long int) __builtin_loongarch_csrwr_d ((unsigned long int) (_1), \ + (_2))) + +/* Assembly instruction format: rd, rj, ui14. */ +/* Data types in instruction templates: UDI, UDI, UDI, USI. */ +#define __csrxchg_d(/*unsigned long int*/ _1, /*unsigned long int*/ _2, \ + /*ui14*/ _3) \ + ((unsigned long int) __builtin_loongarch_csrxchg_d ( \ + (unsigned long int) (_1), (unsigned long int) (_2), (_3))) +#endif + +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: UQI, USI. */ +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__iocsrrd_b (unsigned int _1) +{ + return (unsigned char) __builtin_loongarch_iocsrrd_b ((unsigned int) _1); +} + +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: UHI, USI. */ +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__iocsrrd_h (unsigned int _1) +{ + return (unsigned short) __builtin_loongarch_iocsrrd_h ((unsigned int) _1); +} + +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: USI, USI. */ +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__iocsrrd_w (unsigned int _1) +{ + return (unsigned int) __builtin_loongarch_iocsrrd_w ((unsigned int) _1); +} + +#ifdef __loongarch64 +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: UDI, USI. */ +extern __inline unsigned long int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__iocsrrd_d (unsigned int _1) +{ + return (unsigned long int) __builtin_loongarch_iocsrrd_d ((unsigned int) _1); +} +#endif + +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: VOID, UQI, USI. */ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__iocsrwr_b (unsigned char _1, unsigned int _2) +{ + __builtin_loongarch_iocsrwr_b ((unsigned char) _1, (unsigned int) _2); +} + +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: VOID, UHI, USI. */ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__iocsrwr_h (unsigned short _1, unsigned int _2) +{ + __builtin_loongarch_iocsrwr_h ((unsigned short) _1, (unsigned int) _2); +} + +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: VOID, USI, USI. 
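The CSR and IOCSR accessors above execute privileged instructions, so they are only meaningful in kernel-style code; a rough sketch follows (the CSR number and IOCSR address are placeholders, not real register assignments).

#define EXAMPLE_CSR    0x0   /* placeholder ui14 CSR number */
#define EXAMPLE_IOCSR  0x0u  /* placeholder IOCSR address */

/* Hypothetical example: set bit 0 of a CSR, then read an I/O CSR.  */
static unsigned int
example_csr_ops (void)
{
  unsigned int old = __csrrd_w (EXAMPLE_CSR);   /* read CSR */
  __csrwr_w (old | 1u, EXAMPLE_CSR);            /* write CSR: value first, then CSR number */
  return __iocsrrd_w (EXAMPLE_IOCSR);           /* read an I/O CSR */
}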
*/ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__iocsrwr_w (unsigned int _1, unsigned int _2) +{ + __builtin_loongarch_iocsrwr_w ((unsigned int) _1, (unsigned int) _2); +} + +#ifdef __loongarch64 +/* Assembly instruction format: rd, rj. */ +/* Data types in instruction templates: VOID, UDI, USI. */ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__iocsrwr_d (unsigned long int _1, unsigned int _2) +{ + __builtin_loongarch_iocsrwr_d ((unsigned long int) _1, (unsigned int) _2); +} +#endif + +/* Assembly instruction format: ui15. */ +/* Data types in instruction templates: USI. */ +#define __dbar(/*ui15*/ _1) __builtin_loongarch_dbar ((_1)) + +/* Assembly instruction format: ui15. */ +/* Data types in instruction templates: USI. */ +#define __ibar(/*ui15*/ _1) __builtin_loongarch_ibar ((_1)) + +/* Assembly instruction format: ui15. */ +/* Data types in instruction templates: USI. */ +#define __syscall(/*ui15*/ _1) __builtin_loongarch_syscall ((_1)) + +/* Assembly instruction format: ui15. */ +/* Data types in instruction templates: USI. */ +#define __break(/*ui15*/ _1) __builtin_loongarch_break ((_1)) + +#ifdef __cplusplus +} +#endif +#endif /* _GCC_LOONGARCH_BASE_INTRIN_H */ diff --git a/gcc/config/loongarch/linux.h b/gcc/config/loongarch/linux.h new file mode 100644 index 0000000..110d0fa --- /dev/null +++ b/gcc/config/loongarch/linux.h @@ -0,0 +1,50 @@ +/* Definitions for Linux-based systems with libraries in ELF format. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* Default system library search paths. + * This ensures that a compiler configured with --disable-multilib + * can work in a multilib environment. */ + +#if defined(LA_DISABLE_MULTILIB) && defined(LA_DISABLE_MULTIARCH) + + #if DEFAULT_ABI_BASE == ABI_BASE_LP64D + #define ABI_LIBDIR "lib64" + #elif DEFAULT_ABI_BASE == ABI_BASE_LP64F + #define ABI_LIBDIR "lib64/f32" + #elif DEFAULT_ABI_BASE == ABI_BASE_LP64S + #define ABI_LIBDIR "lib64/sf" + #endif + +#endif + +#ifndef ABI_LIBDIR +#define ABI_LIBDIR "lib" +#endif + +#define STANDARD_STARTFILE_PREFIX_1 "/" ABI_LIBDIR "/" +#define STANDARD_STARTFILE_PREFIX_2 "/usr/" ABI_LIBDIR "/" + + +/* Define this to be nonzero if static stack checking is supported. */ +#define STACK_CHECK_STATIC_BUILTIN 1 + +/* The default value isn't sufficient in 64-bit mode. */ +#define STACK_CHECK_PROTECT (TARGET_64BIT ? 16 * 1024 : 12 * 1024) + +#define TARGET_ASM_FILE_END file_end_indicate_exec_stack diff --git a/gcc/config/loongarch/loongarch-builtins.cc b/gcc/config/loongarch/loongarch-builtins.cc new file mode 100644 index 0000000..64fe111 --- /dev/null +++ b/gcc/config/loongarch/loongarch-builtins.cc @@ -0,0 +1,424 @@ +/* Subroutines used for expanding LoongArch builtins. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. 
+ +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "tm_p.h" +#include "optabs.h" +#include "recog.h" +#include "diagnostic.h" +#include "fold-const.h" +#include "expr.h" +#include "langhooks.h" + +/* Macros to create an enumeration identifier for a function prototype. */ +#define LARCH_FTYPE_NAME1(A, B) LARCH_##A##_FTYPE_##B +#define LARCH_FTYPE_NAME2(A, B, C) LARCH_##A##_FTYPE_##B##_##C +#define LARCH_FTYPE_NAME3(A, B, C, D) LARCH_##A##_FTYPE_##B##_##C##_##D + +/* Classifies the prototype of a built-in function. */ +enum loongarch_function_type +{ +#define DEF_LARCH_FTYPE(NARGS, LIST) LARCH_FTYPE_NAME##NARGS LIST, +#include "config/loongarch/loongarch-ftypes.def" +#undef DEF_LARCH_FTYPE + LARCH_MAX_FTYPE_MAX +}; + +/* Specifies how a built-in function should be converted into rtl. */ +enum loongarch_builtin_type +{ + /* The function corresponds directly to an .md pattern. The return + value is mapped to operand 0 and the arguments are mapped to + operands 1 and above. */ + LARCH_BUILTIN_DIRECT, + + /* The function corresponds directly to an .md pattern. There is no return + value and the arguments are mapped to operands 0 and above. */ + LARCH_BUILTIN_DIRECT_NO_TARGET, + +}; + +/* Declare an availability predicate for built-in functions that require + * COND to be true. NAME is the main part of the predicate's name. */ +#define AVAIL_ALL(NAME, COND) \ + static unsigned int \ + loongarch_builtin_avail_##NAME (void) \ + { \ + return (COND) ? 1 : 0; \ + } + +static unsigned int +loongarch_builtin_avail_default (void) +{ + return 1; +} +/* This structure describes a single built-in function. */ +struct loongarch_builtin_description +{ + /* The code of the main .md file instruction. See loongarch_builtin_type + for more information. */ + enum insn_code icode; + + /* The name of the built-in function. */ + const char *name; + + /* Specifies how the function should be expanded. */ + enum loongarch_builtin_type builtin_type; + + /* The function's prototype. */ + enum loongarch_function_type function_type; + + /* Whether the function is available. */ + unsigned int (*avail) (void); +}; + +AVAIL_ALL (hard_float, TARGET_HARD_FLOAT_ABI) + +/* Construct a loongarch_builtin_description from the given arguments. + + INSN is the name of the associated instruction pattern, without the + leading CODE_FOR_loongarch_. + + CODE is the floating-point condition code associated with the + function. It can be 'f' if the field is not applicable. + + NAME is the name of the function itself, without the leading + "__builtin_loongarch_". + + BUILTIN_TYPE and FUNCTION_TYPE are loongarch_builtin_description fields. + + AVAIL is the name of the availability predicate, without the leading + loongarch_builtin_avail_. 
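   To make these fields concrete, an illustrative, hand-written expansion of
   one entry from the table below: DIRECT_BUILTIN (crc_w_w_w,
   LARCH_SI_FTYPE_SI_SI, default) becomes, via LARCH_BUILTIN, roughly

     { CODE_FOR_loongarch_crc_w_w_w, "__builtin_loongarch_crc_w_w_w",
       LARCH_BUILTIN_DIRECT, LARCH_SI_FTYPE_SI_SI,
       loongarch_builtin_avail_default }

   and loongarch_build_function_type (further below) maps
   LARCH_SI_FTYPE_SI_SI, declared by DEF_LARCH_FTYPE (2, (SI, SI, SI)) in
   loongarch-ftypes.def, to a function type equivalent to SI fn (SI, SI)
   via build_function_type_list.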
*/ +#define LARCH_BUILTIN(INSN, NAME, BUILTIN_TYPE, FUNCTION_TYPE, AVAIL) \ + { \ + CODE_FOR_loongarch_##INSN, "__builtin_loongarch_" NAME, \ + BUILTIN_TYPE, FUNCTION_TYPE, \ + loongarch_builtin_avail_##AVAIL \ + } + +/* Define __builtin_loongarch_<INSN>, which is a LARCH_BUILTIN_DIRECT function + mapped to instruction CODE_FOR_loongarch_<INSN>, FUNCTION_TYPE and AVAIL + are as for LARCH_BUILTIN. */ +#define DIRECT_BUILTIN(INSN, FUNCTION_TYPE, AVAIL) \ + LARCH_BUILTIN (INSN, #INSN, LARCH_BUILTIN_DIRECT, FUNCTION_TYPE, AVAIL) + +/* Define __builtin_loongarch_<INSN>, which is a LARCH_BUILTIN_DIRECT_NO_TARGET + function mapped to instruction CODE_FOR_loongarch_<INSN>, FUNCTION_TYPE + and AVAIL are as for LARCH_BUILTIN. */ +#define DIRECT_NO_TARGET_BUILTIN(INSN, FUNCTION_TYPE, AVAIL) \ + LARCH_BUILTIN (INSN, #INSN, LARCH_BUILTIN_DIRECT_NO_TARGET, \ + FUNCTION_TYPE, AVAIL) + +static const struct loongarch_builtin_description loongarch_builtins[] = { +#define LARCH_MOVFCSR2GR 0 + DIRECT_BUILTIN (movfcsr2gr, LARCH_USI_FTYPE_UQI, hard_float), +#define LARCH_MOVGR2FCSR 1 + DIRECT_NO_TARGET_BUILTIN (movgr2fcsr, LARCH_VOID_FTYPE_UQI_USI, hard_float), + + DIRECT_NO_TARGET_BUILTIN (cacop_w, LARCH_VOID_FTYPE_USI_USI_SI, default), + DIRECT_NO_TARGET_BUILTIN (cacop_d, LARCH_VOID_FTYPE_USI_UDI_SI, default), + DIRECT_NO_TARGET_BUILTIN (dbar, LARCH_VOID_FTYPE_USI, default), + DIRECT_NO_TARGET_BUILTIN (ibar, LARCH_VOID_FTYPE_USI, default), + + DIRECT_BUILTIN (lddir_d, LARCH_DI_FTYPE_DI_UQI, default), + DIRECT_BUILTIN (lddir_w, LARCH_SI_FTYPE_SI_UQI, default), + DIRECT_NO_TARGET_BUILTIN (ldpte_d, LARCH_VOID_FTYPE_DI_UQI, default), + DIRECT_NO_TARGET_BUILTIN (ldpte_w, LARCH_VOID_FTYPE_SI_UQI, default), + + /* CRC Instrinsic */ + + DIRECT_BUILTIN (crc_w_b_w, LARCH_SI_FTYPE_QI_SI, default), + DIRECT_BUILTIN (crc_w_h_w, LARCH_SI_FTYPE_HI_SI, default), + DIRECT_BUILTIN (crc_w_w_w, LARCH_SI_FTYPE_SI_SI, default), + DIRECT_BUILTIN (crc_w_d_w, LARCH_SI_FTYPE_DI_SI, default), + DIRECT_BUILTIN (crcc_w_b_w, LARCH_SI_FTYPE_QI_SI, default), + DIRECT_BUILTIN (crcc_w_h_w, LARCH_SI_FTYPE_HI_SI, default), + DIRECT_BUILTIN (crcc_w_w_w, LARCH_SI_FTYPE_SI_SI, default), + DIRECT_BUILTIN (crcc_w_d_w, LARCH_SI_FTYPE_DI_SI, default), + + DIRECT_BUILTIN (csrrd_w, LARCH_USI_FTYPE_USI, default), + DIRECT_BUILTIN (csrrd_d, LARCH_UDI_FTYPE_USI, default), + DIRECT_BUILTIN (csrwr_w, LARCH_USI_FTYPE_USI_USI, default), + DIRECT_BUILTIN (csrwr_d, LARCH_UDI_FTYPE_UDI_USI, default), + DIRECT_BUILTIN (csrxchg_w, LARCH_USI_FTYPE_USI_USI_USI, default), + DIRECT_BUILTIN (csrxchg_d, LARCH_UDI_FTYPE_UDI_UDI_USI, default), + DIRECT_BUILTIN (iocsrrd_b, LARCH_UQI_FTYPE_USI, default), + DIRECT_BUILTIN (iocsrrd_h, LARCH_UHI_FTYPE_USI, default), + DIRECT_BUILTIN (iocsrrd_w, LARCH_USI_FTYPE_USI, default), + DIRECT_BUILTIN (iocsrrd_d, LARCH_UDI_FTYPE_USI, default), + DIRECT_NO_TARGET_BUILTIN (iocsrwr_b, LARCH_VOID_FTYPE_UQI_USI, default), + DIRECT_NO_TARGET_BUILTIN (iocsrwr_h, LARCH_VOID_FTYPE_UHI_USI, default), + DIRECT_NO_TARGET_BUILTIN (iocsrwr_w, LARCH_VOID_FTYPE_USI_USI, default), + DIRECT_NO_TARGET_BUILTIN (iocsrwr_d, LARCH_VOID_FTYPE_UDI_USI, default), + + DIRECT_BUILTIN (cpucfg, LARCH_USI_FTYPE_USI, default), + DIRECT_NO_TARGET_BUILTIN (asrtle_d, LARCH_VOID_FTYPE_DI_DI, default), + DIRECT_NO_TARGET_BUILTIN (asrtgt_d, LARCH_VOID_FTYPE_DI_DI, default), + DIRECT_NO_TARGET_BUILTIN (syscall, LARCH_VOID_FTYPE_USI, default), + DIRECT_NO_TARGET_BUILTIN (break, LARCH_VOID_FTYPE_USI, default), +}; + +/* Index I is the function declaration for 
loongarch_builtins[I], or null if + the function isn't defined on this target. */ +static GTY (()) tree loongarch_builtin_decls[ARRAY_SIZE (loongarch_builtins)]; +/* Get the index I of the function declaration for loongarch_builtin_decls[I] + using the instruction code or return null if not defined for the target. */ +static GTY (()) int loongarch_get_builtin_decl_index[NUM_INSN_CODES]; + +/* Source-level argument types. */ +#define LARCH_ATYPE_VOID void_type_node +#define LARCH_ATYPE_INT integer_type_node +#define LARCH_ATYPE_POINTER ptr_type_node + +/* Standard mode-based argument types. */ +#define LARCH_ATYPE_QI intQI_type_node +#define LARCH_ATYPE_UQI unsigned_intQI_type_node +#define LARCH_ATYPE_HI intHI_type_node +#define LARCH_ATYPE_UHI unsigned_intHI_type_node +#define LARCH_ATYPE_SI intSI_type_node +#define LARCH_ATYPE_USI unsigned_intSI_type_node +#define LARCH_ATYPE_DI intDI_type_node +#define LARCH_ATYPE_UDI unsigned_intDI_type_node +#define LARCH_ATYPE_SF float_type_node +#define LARCH_ATYPE_DF double_type_node + +/* LARCH_FTYPE_ATYPESN takes N LARCH_FTYPES-like type codes and lists + their associated LARCH_ATYPEs. */ +#define LARCH_FTYPE_ATYPES1(A, B) LARCH_ATYPE_##A, LARCH_ATYPE_##B + +#define LARCH_FTYPE_ATYPES2(A, B, C) \ + LARCH_ATYPE_##A, LARCH_ATYPE_##B, LARCH_ATYPE_##C + +#define LARCH_FTYPE_ATYPES3(A, B, C, D) \ + LARCH_ATYPE_##A, LARCH_ATYPE_##B, LARCH_ATYPE_##C, LARCH_ATYPE_##D + +#define LARCH_FTYPE_ATYPES4(A, B, C, D, E) \ + LARCH_ATYPE_##A, LARCH_ATYPE_##B, LARCH_ATYPE_##C, LARCH_ATYPE_##D, \ + LARCH_ATYPE_##E + +/* Return the function type associated with function prototype TYPE. */ + +static tree +loongarch_build_function_type (enum loongarch_function_type type) +{ + static tree types[(int) LARCH_MAX_FTYPE_MAX]; + + if (types[(int) type] == NULL_TREE) + switch (type) + { +#define DEF_LARCH_FTYPE(NUM, ARGS) \ + case LARCH_FTYPE_NAME##NUM ARGS: \ + types[(int) type] \ + = build_function_type_list (LARCH_FTYPE_ATYPES##NUM ARGS, NULL_TREE); \ + break; +#include "config/loongarch/loongarch-ftypes.def" +#undef DEF_LARCH_FTYPE + default: + gcc_unreachable (); + } + + return types[(int) type]; +} + +/* Implement TARGET_INIT_BUILTINS. */ + +void +loongarch_init_builtins (void) +{ + const struct loongarch_builtin_description *d; + unsigned int i; + tree type; + + /* Iterate through all of the bdesc arrays, initializing all of the + builtin functions. */ + for (i = 0; i < ARRAY_SIZE (loongarch_builtins); i++) + { + d = &loongarch_builtins[i]; + if (d->avail ()) + { + type = loongarch_build_function_type (d->function_type); + loongarch_builtin_decls[i] + = add_builtin_function (d->name, type, i, BUILT_IN_MD, NULL, + NULL); + loongarch_get_builtin_decl_index[d->icode] = i; + } + } +} + +/* Implement TARGET_BUILTIN_DECL. */ + +tree +loongarch_builtin_decl (unsigned int code, bool initialize_p ATTRIBUTE_UNUSED) +{ + if (code >= ARRAY_SIZE (loongarch_builtins)) + return error_mark_node; + return loongarch_builtin_decls[code]; +} + +/* Take argument ARGNO from EXP's argument list and convert it into + an expand operand. Store the operand in *OP. */ + +static void +loongarch_prepare_builtin_arg (struct expand_operand *op, tree exp, + unsigned int argno) +{ + tree arg; + rtx value; + + arg = CALL_EXPR_ARG (exp, argno); + value = expand_normal (arg); + create_input_operand (op, value, TYPE_MODE (TREE_TYPE (arg))); +} + +/* Expand instruction ICODE as part of a built-in function sequence. + Use the first NOPS elements of OPS as the instruction's operands. 
+ HAS_TARGET_P is true if operand 0 is a target; it is false if the + instruction has no target. + + Return the target rtx if HAS_TARGET_P, otherwise return const0_rtx. */ + +static rtx +loongarch_expand_builtin_insn (enum insn_code icode, unsigned int nops, + struct expand_operand *ops, bool has_target_p) +{ + if (!maybe_expand_insn (icode, nops, ops)) + { + error ("invalid argument to built-in function"); + return has_target_p ? gen_reg_rtx (ops[0].mode) : const0_rtx; + } + return has_target_p ? ops[0].value : const0_rtx; +} + +/* Expand a LARCH_BUILTIN_DIRECT or LARCH_BUILTIN_DIRECT_NO_TARGET function; + HAS_TARGET_P says which. EXP is the CALL_EXPR that calls the function + and ICODE is the code of the associated .md pattern. TARGET, if nonnull, + suggests a good place to put the result. */ + +static rtx +loongarch_expand_builtin_direct (enum insn_code icode, rtx target, tree exp, + bool has_target_p) +{ + struct expand_operand ops[MAX_RECOG_OPERANDS]; + int opno, argno; + + /* Map any target to operand 0. */ + opno = 0; + if (has_target_p) + create_output_operand (&ops[opno++], target, TYPE_MODE (TREE_TYPE (exp))); + + /* Map the arguments to the other operands. */ + gcc_assert (opno + call_expr_nargs (exp) + == insn_data[icode].n_generator_args); + for (argno = 0; argno < call_expr_nargs (exp); argno++) + loongarch_prepare_builtin_arg (&ops[opno++], exp, argno); + + return loongarch_expand_builtin_insn (icode, opno, ops, has_target_p); +} + +/* Implement TARGET_EXPAND_BUILTIN. */ + +rtx +loongarch_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, + machine_mode mode ATTRIBUTE_UNUSED, + int ignore ATTRIBUTE_UNUSED) +{ + tree fndecl; + unsigned int fcode, avail; + const struct loongarch_builtin_description *d; + + fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + fcode = DECL_MD_FUNCTION_CODE (fndecl); + gcc_assert (fcode < ARRAY_SIZE (loongarch_builtins)); + d = &loongarch_builtins[fcode]; + avail = d->avail (); + gcc_assert (avail != 0); + switch (d->builtin_type) + { + case LARCH_BUILTIN_DIRECT: + return loongarch_expand_builtin_direct (d->icode, target, exp, true); + + case LARCH_BUILTIN_DIRECT_NO_TARGET: + return loongarch_expand_builtin_direct (d->icode, target, exp, false); + } + gcc_unreachable (); +} + +/* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. 
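   The trees built below amount, roughly, to the following C-level sequence
   (a semantic sketch only; the 0xffe0ffe0 mask clears the exception enable
   and flag fields of FCSR0 while keeping the rounding mode):

     hold:    fcsr_orig = __builtin_loongarch_movfcsr2gr (0);
              fcsr_mod  = fcsr_orig & 0xffe0ffe0;
              __builtin_loongarch_movgr2fcsr (0, fcsr_mod);
     clear:   __builtin_loongarch_movgr2fcsr (0, fcsr_mod);
     update:  exceptions = __builtin_loongarch_movfcsr2gr (0);
              __builtin_loongarch_movgr2fcsr (0, fcsr_orig);
              __atomic_feraiseexcept ((int) exceptions);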
*/ + +void +loongarch_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) +{ + if (!TARGET_HARD_FLOAT_ABI) + return; + tree exceptions_var = create_tmp_var_raw (LARCH_ATYPE_USI); + tree fcsr_orig_var = create_tmp_var_raw (LARCH_ATYPE_USI); + tree fcsr_mod_var = create_tmp_var_raw (LARCH_ATYPE_USI); + tree const0 = build_int_cst (LARCH_ATYPE_UQI, 0); + tree get_fcsr = loongarch_builtin_decls[LARCH_MOVFCSR2GR]; + tree set_fcsr = loongarch_builtin_decls[LARCH_MOVGR2FCSR]; + tree get_fcsr_hold_call = build_call_expr (get_fcsr, 1, const0); + tree hold_assign_orig = build4 (TARGET_EXPR, LARCH_ATYPE_USI, + fcsr_orig_var, get_fcsr_hold_call, + NULL, NULL); + tree hold_mod_val = build2 (BIT_AND_EXPR, LARCH_ATYPE_USI, fcsr_orig_var, + build_int_cst (LARCH_ATYPE_USI, 0xffe0ffe0)); + tree hold_assign_mod = build4 (TARGET_EXPR, LARCH_ATYPE_USI, + fcsr_mod_var, hold_mod_val, NULL, NULL); + tree set_fcsr_hold_call = build_call_expr (set_fcsr, 2, const0, + fcsr_mod_var); + tree hold_all = build2 (COMPOUND_EXPR, LARCH_ATYPE_USI, hold_assign_orig, + hold_assign_mod); + *hold = build2 (COMPOUND_EXPR, void_type_node, hold_all, set_fcsr_hold_call); + + *clear = build_call_expr (set_fcsr, 2, const0, fcsr_mod_var); + + tree get_fcsr_update_call = build_call_expr (get_fcsr, 1, const0); + *update = build4 (TARGET_EXPR, LARCH_ATYPE_USI, exceptions_var, + get_fcsr_update_call, NULL, NULL); + tree set_fcsr_update_call = build_call_expr (set_fcsr, 2, const0, + fcsr_orig_var); + *update = build2 (COMPOUND_EXPR, void_type_node, *update, + set_fcsr_update_call); + tree atomic_feraiseexcept + = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT); + tree int_exceptions_var = fold_convert (integer_type_node, exceptions_var); + tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept, 1, + int_exceptions_var); + *update = build2 (COMPOUND_EXPR, void_type_node, *update, + atomic_feraiseexcept_call); +} + +/* Implement TARGET_BUILTIN_VA_LIST. */ + +tree +loongarch_build_builtin_va_list (void) +{ + return ptr_type_node; +} diff --git a/gcc/config/loongarch/loongarch-c.cc b/gcc/config/loongarch/loongarch-c.cc new file mode 100644 index 0000000..d6e3e19 --- /dev/null +++ b/gcc/config/loongarch/loongarch-c.cc @@ -0,0 +1,109 @@ +/* LoongArch-specific code for C family languages. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "c-family/c-common.h" +#include "cpplib.h" + +#define preprocessing_asm_p() (cpp_get_options (pfile)->lang == CLK_ASM) +#define builtin_define(TXT) cpp_define (pfile, TXT) +#define builtin_assert(TXT) cpp_assert (pfile, TXT) + +/* Define preprocessor macros for the -march and -mtune options. + PREFIX is either _LOONGARCH_ARCH or _LOONGARCH_TUNE, INFO is + the selected processor. 
If INFO's canonical name is "foo", + define PREFIX to be "foo", and define an additional macro + PREFIX_FOO. */ +#define LARCH_CPP_SET_PROCESSOR(PREFIX, CPU_TYPE) \ + do \ + { \ + char *macro, *p; \ + int cpu_type = (CPU_TYPE); \ + \ + macro = concat ((PREFIX), "_", \ + loongarch_cpu_strings[cpu_type], NULL); \ + for (p = macro; *p != 0; p++) \ + *p = TOUPPER (*p); \ + \ + builtin_define (macro); \ + builtin_define_with_value ((PREFIX), \ + loongarch_cpu_strings[cpu_type], 1); \ + free (macro); \ + } \ + while (0) + +void +loongarch_cpu_cpp_builtins (cpp_reader *pfile) +{ + builtin_assert ("machine=loongarch"); + builtin_assert ("cpu=loongarch"); + builtin_define ("__loongarch__"); + + LARCH_CPP_SET_PROCESSOR ("_LOONGARCH_ARCH", LARCH_ACTUAL_ARCH); + LARCH_CPP_SET_PROCESSOR ("_LOONGARCH_TUNE", LARCH_ACTUAL_TUNE); + + /* Base architecture / ABI. */ + if (TARGET_64BIT) + { + builtin_define ("__loongarch_grlen=64"); + builtin_define ("__loongarch64"); + } + + if (TARGET_ABI_LP64) + { + builtin_define ("_ABILP64=3"); + builtin_define ("_LOONGARCH_SIM=_ABILP64"); + builtin_define ("__loongarch_lp64"); + } + + /* These defines reflect the ABI in use, not whether the + FPU is directly accessible. */ + if (TARGET_DOUBLE_FLOAT_ABI) + builtin_define ("__loongarch_double_float=1"); + else if (TARGET_SINGLE_FLOAT_ABI) + builtin_define ("__loongarch_single_float=1"); + + if (TARGET_DOUBLE_FLOAT_ABI || TARGET_SINGLE_FLOAT_ABI) + builtin_define ("__loongarch_hard_float=1"); + else + builtin_define ("__loongarch_soft_float=1"); + + + /* ISA Extensions. */ + if (TARGET_DOUBLE_FLOAT) + builtin_define ("__loongarch_frlen=64"); + else if (TARGET_SINGLE_FLOAT) + builtin_define ("__loongarch_frlen=32"); + else + builtin_define ("__loongarch_frlen=0"); + + /* Native Data Sizes. */ + builtin_define_with_int_value ("_LOONGARCH_SZINT", INT_TYPE_SIZE); + builtin_define_with_int_value ("_LOONGARCH_SZLONG", LONG_TYPE_SIZE); + builtin_define_with_int_value ("_LOONGARCH_SZPTR", POINTER_SIZE); + builtin_define_with_int_value ("_LOONGARCH_FPSET", 32); + builtin_define_with_int_value ("_LOONGARCH_SPFPSET", 32); + +} diff --git a/gcc/config/loongarch/loongarch-cpu.cc b/gcc/config/loongarch/loongarch-cpu.cc new file mode 100644 index 0000000..a886dd9 --- /dev/null +++ b/gcc/config/loongarch/loongarch-cpu.cc @@ -0,0 +1,206 @@ +/* Definitions for LoongArch CPU properties. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. 
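Referring back to loongarch_cpu_cpp_builtins above: a hypothetical -march=la464 -mabi=lp64d -mdouble-float compile would predefine, among others, __loongarch__, __loongarch64, __loongarch_grlen=64, __loongarch_lp64, __loongarch_double_float=1, __loongarch_hard_float=1 and __loongarch_frlen=64, plus _LOONGARCH_ARCH="la464" and _LOONGARCH_ARCH_LA464 (assuming the canonical name string for LA464 is "la464"), so user code can test the configuration with, for example:

#if defined __loongarch_lp64 && defined __loongarch_double_float
/* LP64D-specific code here.  */
#endif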
*/ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "diagnostic-core.h" + +#include "loongarch-opts.h" +#include "loongarch-cpu.h" +#include "loongarch-str.h" + +/* Native CPU detection with "cpucfg" */ +#define N_CPUCFG_WORDS 0x15 +static uint32_t cpucfg_cache[N_CPUCFG_WORDS] = { 0 }; +static const int cpucfg_useful_idx[] = {0, 1, 2, 16, 17, 18, 19}; + +static uint32_t +read_cpucfg_word (int wordno) +{ + /* To make cross-compiler shut up. */ + (void) wordno; + uint32_t ret = 0; + + #ifdef __loongarch__ + __asm__ __volatile__ ("cpucfg %0,%1\n\t" + :"=r"(ret) + :"r"(wordno) + :); + #endif + + return ret; +} + +void +cache_cpucfg (void) +{ + for (unsigned int i = 0; i < sizeof (cpucfg_useful_idx) / sizeof (int); i++) + { + cpucfg_cache[cpucfg_useful_idx[i]] + = read_cpucfg_word (cpucfg_useful_idx[i]); + } +} + +uint32_t +get_native_prid (void) +{ + /* Fill loongarch_cpu_default_config[CPU_NATIVE] with cpucfg data, + see "Loongson Architecture Reference Manual" + (Volume 1, Section 2.2.10.5) */ + return cpucfg_cache[0]; +} + +const char* +get_native_prid_str (void) +{ + static char prid_str[9]; + sprintf (prid_str, "%08x", cpucfg_cache[0]); + return (const char*) prid_str; +} + +/* Fill property tables for CPU_NATIVE. */ +unsigned int +fill_native_cpu_config (int p_arch_native, int p_tune_native) +{ + int ret_cpu_type; + + /* Nothing needs to be done unless "-march/tune=native" + is given or implied. */ + if (!(p_arch_native || p_tune_native)) + return CPU_NATIVE; + + /* Fill cpucfg_cache with the "cpucfg" instruction. */ + cache_cpucfg (); + + + /* Fill: loongarch_cpu_default_isa[CPU_NATIVE].base + With: base architecture (ARCH) + At: cpucfg_words[1][1:0] */ + + #define NATIVE_BASE_ISA (loongarch_cpu_default_isa[CPU_NATIVE].base) + switch (cpucfg_cache[1] & 0x3) + { + case 0x02: + NATIVE_BASE_ISA = ISA_BASE_LA64V100; + break; + + default: + if (p_arch_native) + fatal_error (UNKNOWN_LOCATION, + "unknown base architecture %<0x%x%>, %qs failed", + (unsigned int) (cpucfg_cache[1] & 0x3), + "-m" OPTSTR_ARCH "=" STR_CPU_NATIVE); + } + + /* Fill: loongarch_cpu_default_isa[CPU_NATIVE].fpu + With: FPU type (FP, FP_SP, FP_DP) + At: cpucfg_words[2][2:0] */ + + #define NATIVE_FPU (loongarch_cpu_default_isa[CPU_NATIVE].fpu) + switch (cpucfg_cache[2] & 0x7) + { + case 0x07: + NATIVE_FPU = ISA_EXT_FPU64; + break; + + case 0x03: + NATIVE_FPU = ISA_EXT_FPU32; + break; + + case 0x00: + NATIVE_FPU = ISA_EXT_NOFPU; + break; + + default: + if (p_arch_native) + fatal_error (UNKNOWN_LOCATION, + "unknown FPU type %<0x%x%>, %qs failed", + (unsigned int) (cpucfg_cache[2] & 0x7), + "-m" OPTSTR_ARCH "=" STR_CPU_NATIVE); + } + + /* Fill: loongarch_cpu_cache[CPU_NATIVE] + With: cache size info + At: cpucfg_words[16:20][31:0] */ + + int l1d_present = 0, l1u_present = 0; + int l2d_present = 0; + uint32_t l1_szword, l2_szword; + + l1u_present |= cpucfg_cache[16] & 3; /* bit[1:0]: unified l1 cache */ + l1d_present |= cpucfg_cache[16] & 4; /* bit[2:2]: l1 dcache */ + l1_szword = l1d_present ? 18 : (l1u_present ? 17 : 0); + l1_szword = l1_szword ? cpucfg_cache[l1_szword]: 0; + + l2d_present |= cpucfg_cache[16] & 24; /* bit[4:3]: unified l2 cache */ + l2d_present |= cpucfg_cache[16] & 128; /* bit[7:7]: l2 dcache */ + l2_szword = l2d_present ? 
cpucfg_cache[19]: 0; + + loongarch_cpu_cache[CPU_NATIVE].l1d_line_size + = 1 << ((l1_szword & 0x7f000000) >> 24); /* bit[30:24]: log2(linesize) */ + + loongarch_cpu_cache[CPU_NATIVE].l1d_size + = (1 << ((l1_szword & 0x00ff0000) >> 16)) /* bit[23:16]: log2(idx) */ + * ((l1_szword & 0x0000ffff) + 1) /* bit[15:0]: sets - 1 */ + * (1 << ((l1_szword & 0x7f000000) >> 24)) /* bit[30:24]: log2(linesize) */ + >> 10; /* in kilobytes */ + + loongarch_cpu_cache[CPU_NATIVE].l2d_size + = (1 << ((l2_szword & 0x00ff0000) >> 16)) /* bit[23:16]: log2(idx) */ + * ((l2_szword & 0x0000ffff) + 1) /* bit[15:0]: sets - 1 */ + * (1 << ((l2_szword & 0x7f000000) >> 24)) /* bit[30:24]: log2(linesize) */ + >> 10; /* in kilobytes */ + + /* Fill: ret_cpu_type + With: processor ID (PRID) + At: cpucfg_words[0][31:0] */ + + switch (cpucfg_cache[0] & 0x00ffff00) + { + case 0x0014c000: /* LA464 */ + ret_cpu_type = CPU_LA464; + break; + + default: + /* Unknown PRID. This is generally harmless as long as + the properties above can be obtained via "cpucfg". */ + if (p_tune_native) + inform (UNKNOWN_LOCATION, "unknown processor ID %<0x%x%>, " + "some tuning parameters will fall back to default", + cpucfg_cache[0]); + break; + } + + /* Properties that cannot be looked up directly using cpucfg. */ + loongarch_cpu_issue_rate[CPU_NATIVE] + = loongarch_cpu_issue_rate[ret_cpu_type]; + + loongarch_cpu_multipass_dfa_lookahead[CPU_NATIVE] + = loongarch_cpu_multipass_dfa_lookahead[ret_cpu_type]; + + loongarch_cpu_rtx_cost_data[CPU_NATIVE] + = loongarch_cpu_rtx_cost_data[ret_cpu_type]; + + return ret_cpu_type; +} diff --git a/gcc/config/loongarch/loongarch-cpu.h b/gcc/config/loongarch/loongarch-cpu.h new file mode 100644 index 0000000..93d656f --- /dev/null +++ b/gcc/config/loongarch/loongarch-cpu.h @@ -0,0 +1,30 @@ +/* Definitions for loongarch native cpu property detection routines. + Copyright (C) 2020-2021 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef LOONGARCH_CPU_H +#define LOONGARCH_CPU_H + +#include "system.h" + +void cache_cpucfg (void); +unsigned int fill_native_cpu_config (int p_arch_native, int p_tune_native); +uint32_t get_native_prid (void); +const char* get_native_prid_str (void); + +#endif /* LOONGARCH_CPU_H */ diff --git a/gcc/config/loongarch/loongarch-def.c b/gcc/config/loongarch/loongarch-def.c new file mode 100644 index 0000000..c8769b7 --- /dev/null +++ b/gcc/config/loongarch/loongarch-def.c @@ -0,0 +1,179 @@ +/* LoongArch static properties. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
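A worked example of the cache-geometry decoding in fill_native_cpu_config above, with hypothetical field values chosen to be consistent with the 64 KiB, 64-byte-line LA464 defaults used elsewhere in this patch: if l1_szword has bit[30:24] = 6, bit[23:16] = 8 and bit[15:0] = 3, then

  l1d_line_size = 1 << 6 = 64 (bytes)
  l1d_size      = ((1 << 8) * (3 + 1) * (1 << 6)) >> 10
                = 65536 >> 10
                = 64 (KiB)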
+ +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "loongarch-def.h" +#include "loongarch-str.h" + +/* Default RTX cost initializer. */ +#define COSTS_N_INSNS(N) ((N) * 4) +#define DEFAULT_COSTS \ + .fp_add = COSTS_N_INSNS (1), \ + .fp_mult_sf = COSTS_N_INSNS (2), \ + .fp_mult_df = COSTS_N_INSNS (4), \ + .fp_div_sf = COSTS_N_INSNS (6), \ + .fp_div_df = COSTS_N_INSNS (8), \ + .int_mult_si = COSTS_N_INSNS (1), \ + .int_mult_di = COSTS_N_INSNS (1), \ + .int_div_si = COSTS_N_INSNS (4), \ + .int_div_di = COSTS_N_INSNS (6), \ + .branch_cost = 2, \ + .memory_latency = 4 + +/* CPU property tables. */ +const char* +loongarch_cpu_strings[N_TUNE_TYPES] = { + [CPU_NATIVE] = STR_CPU_NATIVE, + [CPU_LOONGARCH64] = STR_CPU_LOONGARCH64, + [CPU_LA464] = STR_CPU_LA464, +}; + +struct loongarch_isa +loongarch_cpu_default_isa[N_ARCH_TYPES] = { + [CPU_LOONGARCH64] = { + .base = ISA_BASE_LA64V100, + .fpu = ISA_EXT_FPU64, + }, + [CPU_LA464] = { + .base = ISA_BASE_LA64V100, + .fpu = ISA_EXT_FPU64, + }, +}; + +struct loongarch_cache +loongarch_cpu_cache[N_TUNE_TYPES] = { + [CPU_LOONGARCH64] = { + .l1d_line_size = 64, + .l1d_size = 64, + .l2d_size = 256, + }, + [CPU_LA464] = { + .l1d_line_size = 64, + .l1d_size = 64, + .l2d_size = 256, + }, +}; + +/* The following properties cannot be looked up directly using "cpucfg". + So it is necessary to provide a default value for "unknown native" + tune targets (i.e. -mtune=native while PRID does not correspond to + any known "-mtune" type). */ + +struct loongarch_rtx_cost_data +loongarch_cpu_rtx_cost_data[N_TUNE_TYPES] = { + [CPU_NATIVE] = { + DEFAULT_COSTS + }, + [CPU_LOONGARCH64] = { + DEFAULT_COSTS + }, + [CPU_LA464] = { + DEFAULT_COSTS + }, +}; + +/* RTX costs to use when optimizing for size. */ +extern const struct loongarch_rtx_cost_data +loongarch_rtx_cost_optimize_size = { + .fp_add = 4, + .fp_mult_sf = 4, + .fp_mult_df = 4, + .fp_div_sf = 4, + .fp_div_df = 4, + .int_mult_si = 4, + .int_mult_di = 4, + .int_div_si = 4, + .int_div_di = 4, + .branch_cost = 2, + .memory_latency = 4, +}; + +int +loongarch_cpu_issue_rate[N_TUNE_TYPES] = { + [CPU_NATIVE] = 4, + [CPU_LOONGARCH64] = 4, + [CPU_LA464] = 4, +}; + +int +loongarch_cpu_multipass_dfa_lookahead[N_TUNE_TYPES] = { + [CPU_NATIVE] = 4, + [CPU_LOONGARCH64] = 4, + [CPU_LA464] = 4, +}; + +/* Wiring string definitions from loongarch-str.h to global arrays + with standard index values from loongarch-opts.h, so we can + print config-related messages and do ABI self-spec filtering + from the driver in a self-consistent manner. 
*/ + +const char* +loongarch_isa_base_strings[N_ISA_BASE_TYPES] = { + [ISA_BASE_LA64V100] = STR_ISA_BASE_LA64V100, +}; + +const char* +loongarch_isa_ext_strings[N_ISA_EXT_TYPES] = { + [ISA_EXT_FPU64] = STR_ISA_EXT_FPU64, + [ISA_EXT_FPU32] = STR_ISA_EXT_FPU32, + [ISA_EXT_NOFPU] = STR_ISA_EXT_NOFPU, +}; + +const char* +loongarch_abi_base_strings[N_ABI_BASE_TYPES] = { + [ABI_BASE_LP64D] = STR_ABI_BASE_LP64D, + [ABI_BASE_LP64F] = STR_ABI_BASE_LP64F, + [ABI_BASE_LP64S] = STR_ABI_BASE_LP64S, +}; + +const char* +loongarch_abi_ext_strings[N_ABI_EXT_TYPES] = { + [ABI_EXT_BASE] = STR_ABI_EXT_BASE, +}; + +const char* +loongarch_cmodel_strings[] = { + [CMODEL_NORMAL] = STR_CMODEL_NORMAL, + [CMODEL_TINY] = STR_CMODEL_TINY, + [CMODEL_TINY_STATIC] = STR_CMODEL_TS, + [CMODEL_LARGE] = STR_CMODEL_LARGE, + [CMODEL_EXTREME] = STR_CMODEL_EXTREME, +}; + +const char* +loongarch_switch_strings[] = { + [SW_SOFT_FLOAT] = OPTSTR_SOFT_FLOAT, + [SW_SINGLE_FLOAT] = OPTSTR_SINGLE_FLOAT, + [SW_DOUBLE_FLOAT] = OPTSTR_DOUBLE_FLOAT, +}; + + +/* ABI-related definitions. */ +const struct loongarch_isa +abi_minimal_isa[N_ABI_BASE_TYPES][N_ABI_EXT_TYPES] = { + [ABI_BASE_LP64D] = { + [ABI_EXT_BASE] = {.base = ISA_BASE_LA64V100, .fpu = ISA_EXT_FPU64}, + }, + [ABI_BASE_LP64F] = { + [ABI_EXT_BASE] = {.base = ISA_BASE_LA64V100, .fpu = ISA_EXT_FPU32}, + }, + [ABI_BASE_LP64S] = { + [ABI_EXT_BASE] = {.base = ISA_BASE_LA64V100, .fpu = ISA_EXT_NOFPU}, + }, +}; diff --git a/gcc/config/loongarch/loongarch-def.h b/gcc/config/loongarch/loongarch-def.h new file mode 100644 index 0000000..c2c35b6 --- /dev/null +++ b/gcc/config/loongarch/loongarch-def.h @@ -0,0 +1,151 @@ +/* LoongArch definitions. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* Definition of standard codes for: + - base architecture types (isa_base), + - ISA extensions (isa_ext), + - base ABI types (abi_base), + - ABI extension types (abi_ext). + + - code models (cmodel) + - other command-line switches (switch) + + These values are primarily used for implementing option handling + logic in "loongarch.opt", "loongarch-driver.c" and "loongarch-opt.c". + + As for the result of this option handling process, the following + scheme is adopted to represent the final configuration: + + - The target ABI is encoded with a tuple (abi_base, abi_ext) + using the code defined below. + + - The target ISA is encoded with a "struct loongarch_isa" defined + in loongarch-cpu.h. + + - The target microarchitecture is represented with a cpu model + index defined in loongarch-cpu.h. 
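   As a hypothetical illustration of this scheme, a target configured with
   -march=la464 -mabi=lp64d -mcmodel=normal would be described, using the
   codes and structs declared below, roughly as:

     isa      = { .base = ISA_BASE_LA64V100, .fpu = ISA_EXT_FPU64 }
     abi      = { .base = ABI_BASE_LP64D,    .ext = ABI_EXT_BASE }
     cpu_arch = CPU_LA464,  cpu_tune = CPU_LA464,  cmodel = CMODEL_NORMAL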
+*/ + +#ifndef LOONGARCH_DEF_H +#define LOONGARCH_DEF_H + +#include "loongarch-tune.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* enum isa_base */ +extern const char* loongarch_isa_base_strings[]; +#define ISA_BASE_LA64V100 0 +#define N_ISA_BASE_TYPES 1 + +/* enum isa_ext_* */ +extern const char* loongarch_isa_ext_strings[]; +#define ISA_EXT_NOFPU 0 +#define ISA_EXT_FPU32 1 +#define ISA_EXT_FPU64 2 +#define N_ISA_EXT_FPU_TYPES 3 +#define N_ISA_EXT_TYPES 3 + +/* enum abi_base */ +extern const char* loongarch_abi_base_strings[]; +#define ABI_BASE_LP64D 0 +#define ABI_BASE_LP64F 1 +#define ABI_BASE_LP64S 2 +#define N_ABI_BASE_TYPES 3 + +/* enum abi_ext */ +extern const char* loongarch_abi_ext_strings[]; +#define ABI_EXT_BASE 0 +#define N_ABI_EXT_TYPES 1 + +/* enum cmodel */ +extern const char* loongarch_cmodel_strings[]; +#define CMODEL_NORMAL 0 +#define CMODEL_TINY 1 +#define CMODEL_TINY_STATIC 2 +#define CMODEL_LARGE 3 +#define CMODEL_EXTREME 4 +#define N_CMODEL_TYPES 5 + +/* enum switches */ +/* The "SW_" codes represent command-line switches (options that + accept no parameters). Definition for other switches that affects + the target ISA / ABI configuration will also be appended here + in the future. */ + +extern const char* loongarch_switch_strings[]; +#define SW_SOFT_FLOAT 0 +#define SW_SINGLE_FLOAT 1 +#define SW_DOUBLE_FLOAT 2 +#define N_SWITCH_TYPES 3 + +/* The common default value for variables whose assignments + are triggered by command-line options. */ + +#define M_OPTION_NOT_SEEN -1 +#define M_OPT_ABSENT(opt_enum) ((opt_enum) == M_OPTION_NOT_SEEN) + + +/* Internal representation of the target. */ +struct loongarch_isa +{ + unsigned char base; /* ISA_BASE_ */ + unsigned char fpu; /* ISA_EXT_FPU_ */ +}; + +struct loongarch_abi +{ + unsigned char base; /* ABI_BASE_ */ + unsigned char ext; /* ABI_EXT_ */ +}; + +struct loongarch_target +{ + struct loongarch_isa isa; + struct loongarch_abi abi; + unsigned char cpu_arch; /* CPU_ */ + unsigned char cpu_tune; /* same */ + unsigned char cpu_native; /* same */ + unsigned char cmodel; /* CMODEL_ */ +}; + +/* CPU properties. */ +/* index */ +#define CPU_NATIVE 0 +#define CPU_LOONGARCH64 1 +#define CPU_LA464 2 +#define N_ARCH_TYPES 3 +#define N_TUNE_TYPES 3 + +/* parallel tables. */ +extern const char* loongarch_cpu_strings[]; +extern struct loongarch_isa loongarch_cpu_default_isa[]; +extern int loongarch_cpu_issue_rate[]; +extern int loongarch_cpu_multipass_dfa_lookahead[]; + +extern struct loongarch_cache loongarch_cpu_cache[]; +extern struct loongarch_rtx_cost_data loongarch_cpu_rtx_cost_data[]; + +#ifdef __cplusplus +} +#endif +#endif /* LOONGARCH_DEF_H */ diff --git a/gcc/config/loongarch/loongarch-driver.cc b/gcc/config/loongarch/loongarch-driver.cc new file mode 100644 index 0000000..0adcc92 --- /dev/null +++ b/gcc/config/loongarch/loongarch-driver.cc @@ -0,0 +1,187 @@ +/* Subroutines for the gcc driver. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "obstack.h" +#include "diagnostic-core.h" + +#include "loongarch-opts.h" +#include "loongarch-driver.h" + +static int + opt_arch_driver = M_OPTION_NOT_SEEN, + opt_tune_driver = M_OPTION_NOT_SEEN, + opt_fpu_driver = M_OPTION_NOT_SEEN, + opt_abi_base_driver = M_OPTION_NOT_SEEN, + opt_abi_ext_driver = M_OPTION_NOT_SEEN, + opt_cmodel_driver = M_OPTION_NOT_SEEN; + +int opt_switches = 0; + +/* This flag is set to 1 if we believe that the user might be avoiding + linking (implicitly) against something from the startfile search paths. */ +static int no_link = 0; + +#define LARCH_DRIVER_SET_M_FLAG(OPTS_ARRAY, N_OPTS, FLAG, STR) \ + for (int i = 0; i < (N_OPTS); i++) \ + { \ + if ((OPTS_ARRAY)[i] != 0) \ + if (strcmp ((STR), (OPTS_ARRAY)[i]) == 0) \ + (FLAG) = i; \ + } + +/* Use the public obstack from the gcc driver (defined in gcc.c). + This is for allocating space for the returned string. */ +extern struct obstack opts_obstack; + +#define APPEND_LTR(S) \ + obstack_grow (&opts_obstack, (const void*) (S), \ + sizeof ((S)) / sizeof (char) -1) + +#define APPEND_VAL(S) \ + obstack_grow (&opts_obstack, (const void*) (S), strlen ((S))) + + +const char* +driver_set_m_flag (int argc, const char **argv) +{ + int parm_off = 0; + + if (argc != 1) + return "%eset_m_flag requires exactly 1 argument."; + +#undef PARM +#define PARM (argv[0] + parm_off) + +/* Note: sizeof (OPTSTR_##NAME) equals the length of "<option>=". */ +#undef MATCH_OPT +#define MATCH_OPT(NAME) \ + (strncmp (argv[0], OPTSTR_##NAME "=", \ + (parm_off = sizeof (OPTSTR_##NAME))) == 0) + + if (strcmp (argv[0], "no_link") == 0) + { + no_link = 1; + } + else if (MATCH_OPT (ABI_BASE)) + { + LARCH_DRIVER_SET_M_FLAG ( + loongarch_abi_base_strings, N_ABI_BASE_TYPES, + opt_abi_base_driver, PARM) + } + else if (MATCH_OPT (ISA_EXT_FPU)) + { + LARCH_DRIVER_SET_M_FLAG (loongarch_isa_ext_strings, N_ISA_EXT_FPU_TYPES, + opt_fpu_driver, PARM) + } + else if (MATCH_OPT (ARCH)) + { + LARCH_DRIVER_SET_M_FLAG (loongarch_cpu_strings, N_ARCH_TYPES, + opt_arch_driver, PARM) + } + else if (MATCH_OPT (TUNE)) + { + LARCH_DRIVER_SET_M_FLAG (loongarch_cpu_strings, N_TUNE_TYPES, + opt_tune_driver, PARM) + } + else if (MATCH_OPT (CMODEL)) + { + LARCH_DRIVER_SET_M_FLAG (loongarch_cmodel_strings, N_CMODEL_TYPES, + opt_cmodel_driver, PARM) + } + else /* switches */ + { + int switch_idx = M_OPTION_NOT_SEEN; + + LARCH_DRIVER_SET_M_FLAG (loongarch_switch_strings, N_SWITCH_TYPES, + switch_idx, argv[0]) + + if (switch_idx != M_OPTION_NOT_SEEN) + opt_switches |= loongarch_switch_mask[switch_idx]; + } + return ""; +} + +const char* +driver_get_normalized_m_opts (int argc, const char **argv) +{ + if (argc != 0) + { + (void) argv; /* To make compiler shut up about unused argument. */ + return " %eget_normalized_m_opts requires no argument.\n"; + } + + loongarch_config_target (& la_target, + opt_switches, + opt_arch_driver, + opt_tune_driver, + opt_fpu_driver, + opt_abi_base_driver, + opt_abi_ext_driver, + opt_cmodel_driver, + !no_link /* follow_multilib_list */); + + /* Output normalized option strings. 
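   For a hypothetical default lp64d / loongarch64 / fpu64 / normal
   configuration, the spec fragment assembled below would look roughly like
   this (exact option names come from loongarch-str.h):

     %<msoft-float %<msingle-float %<mdouble-float
     %<mabi=*  -mabi=lp64d   %<march=*   -march=loongarch64
     %<mfpu=*  -mfpu=64      %<mcmodel=* -mcmodel=normal
     %<mtune=* -mtune=loongarch64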
*/ + obstack_blank (&opts_obstack, 0); + +#undef APPEND_LTR +#define APPEND_LTR(S) \ + obstack_grow (&opts_obstack, (const void*) (S), \ + sizeof ((S)) / sizeof (char) -1) + +#undef APPEND_VAL +#define APPEND_VAL(S) \ + obstack_grow (&opts_obstack, (const void*) (S), strlen ((S))) + +#undef APPEND_OPT +#define APPEND_OPT(NAME) \ + APPEND_LTR (" %<m" OPTSTR_##NAME "=* " \ + " -m" OPTSTR_##NAME "=") + + for (int i = 0; i < N_SWITCH_TYPES; i++) + { + APPEND_LTR (" %<m"); + APPEND_VAL (loongarch_switch_strings[i]); + } + + APPEND_OPT (ABI_BASE); + APPEND_VAL (loongarch_abi_base_strings[la_target.abi.base]); + + APPEND_OPT (ARCH); + APPEND_VAL (loongarch_cpu_strings[la_target.cpu_arch]); + + APPEND_OPT (ISA_EXT_FPU); + APPEND_VAL (loongarch_isa_ext_strings[la_target.isa.fpu]); + + APPEND_OPT (CMODEL); + APPEND_VAL (loongarch_cmodel_strings[la_target.cmodel]); + + APPEND_OPT (TUNE); + APPEND_VAL (loongarch_cpu_strings[la_target.cpu_tune]); + + obstack_1grow (&opts_obstack, '\0'); + + return XOBFINISH (&opts_obstack, const char *); +} diff --git a/gcc/config/loongarch/loongarch-driver.h b/gcc/config/loongarch/loongarch-driver.h new file mode 100644 index 0000000..2e4a7a9 --- /dev/null +++ b/gcc/config/loongarch/loongarch-driver.h @@ -0,0 +1,68 @@ +/* Subroutine headers for the gcc driver. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef LOONGARCH_DRIVER_H +#define LOONGARCH_DRIVER_H + +#include "loongarch-str.h" + +extern const char* +driver_set_m_flag (int argc, const char **argv); + +extern const char* +driver_get_normalized_m_opts (int argc, const char **argv); + +#define EXTRA_SPEC_FUNCTIONS \ + { "set_m_flag", driver_set_m_flag }, \ + { "get_normalized_m_opts", driver_get_normalized_m_opts }, + +/* Pre-process ABI-related options. */ +#define LA_SET_PARM_SPEC(NAME) \ + " %{m" OPTSTR_##NAME "=*: %:set_m_flag(" OPTSTR_##NAME "=%*)}" \ + +#define LA_SET_FLAG_SPEC(NAME) \ + " %{m" OPTSTR_##NAME ": %:set_m_flag(" OPTSTR_##NAME ")}" \ + +#define DRIVER_HANDLE_MACHINE_OPTIONS \ + " %{c|S|E|nostdlib: %:set_m_flag(no_link)}" \ + " %{nostartfiles: %{nodefaultlibs: %:set_m_flag(no_link)}}" \ + LA_SET_PARM_SPEC (ABI_BASE) \ + LA_SET_PARM_SPEC (ARCH) \ + LA_SET_PARM_SPEC (TUNE) \ + LA_SET_PARM_SPEC (ISA_EXT_FPU) \ + LA_SET_PARM_SPEC (CMODEL) \ + LA_SET_FLAG_SPEC (SOFT_FLOAT) \ + LA_SET_FLAG_SPEC (SINGLE_FLOAT) \ + LA_SET_FLAG_SPEC (DOUBLE_FLOAT) \ + " %:get_normalized_m_opts()" + +#define DRIVER_SELF_SPECS \ + DRIVER_HANDLE_MACHINE_OPTIONS + +/* ABI spec strings. 
*/ +#define ABI_GRLEN_SPEC \ + "%{mabi=lp64*:64}" \ + +#define ABI_SPEC \ + "%{mabi=lp64d:lp64d}" \ + "%{mabi=lp64f:lp64f}" \ + "%{mabi=lp64s:lp64s}" \ + +#endif /* LOONGARCH_DRIVER_H */ diff --git a/gcc/config/loongarch/loongarch-ftypes.def b/gcc/config/loongarch/loongarch-ftypes.def new file mode 100644 index 0000000..2babff4 --- /dev/null +++ b/gcc/config/loongarch/loongarch-ftypes.def @@ -0,0 +1,65 @@ +/* Definitions of prototypes for LoongArch built-in functions. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + Based on MIPS target for GNU ckompiler. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* Invoke DEF_LARCH_FTYPE (NARGS, LIST) for each prototype used by + LoongArch built-in functions, where: + + NARGS is the number of arguments. + LIST contains the return-type code followed by the codes for each + argument type. + + Argument- and return-type codes are either modes or one of the following: + + VOID for void_type_node + INT for integer_type_node + POINTER for ptr_type_node + + (we don't use PTR because that's a ANSI-compatibillity macro). + + Please keep this list lexicographically sorted by the LIST argument. */ + +DEF_LARCH_FTYPE (1, (UQI, USI)) +DEF_LARCH_FTYPE (1, (UHI, USI)) +DEF_LARCH_FTYPE (1, (USI, USI)) +DEF_LARCH_FTYPE (1, (UDI, USI)) +DEF_LARCH_FTYPE (1, (USI, UQI)) +DEF_LARCH_FTYPE (1, (VOID, USI)) + +DEF_LARCH_FTYPE (2, (VOID, UQI, USI)) +DEF_LARCH_FTYPE (2, (VOID, UHI, USI)) +DEF_LARCH_FTYPE (2, (VOID, USI, USI)) +DEF_LARCH_FTYPE (2, (VOID, UDI, USI)) +DEF_LARCH_FTYPE (2, (VOID, DI, UQI)) +DEF_LARCH_FTYPE (2, (VOID, SI, UQI)) +DEF_LARCH_FTYPE (2, (VOID, DI, DI)) +DEF_LARCH_FTYPE (2, (SI, SI, UQI)) +DEF_LARCH_FTYPE (2, (DI, DI, UQI)) +DEF_LARCH_FTYPE (2, (SI, QI, SI)) +DEF_LARCH_FTYPE (2, (SI, HI, SI)) +DEF_LARCH_FTYPE (2, (SI, SI, SI)) +DEF_LARCH_FTYPE (2, (SI, DI, SI)) +DEF_LARCH_FTYPE (2, (USI, USI, USI)) +DEF_LARCH_FTYPE (2, (UDI, UDI, USI)) + +DEF_LARCH_FTYPE (3, (VOID, USI, USI, SI)) +DEF_LARCH_FTYPE (3, (VOID, USI, UDI, SI)) +DEF_LARCH_FTYPE (3, (USI, USI, USI, USI)) +DEF_LARCH_FTYPE (3, (UDI, UDI, UDI, USI)) diff --git a/gcc/config/loongarch/loongarch-modes.def b/gcc/config/loongarch/loongarch-modes.def new file mode 100644 index 0000000..7f06e2d --- /dev/null +++ b/gcc/config/loongarch/loongarch-modes.def @@ -0,0 +1,25 @@ +/* LoongArch extra machine modes. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + Based on MIPS target for GNU compiler. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +FLOAT_MODE (TF, 16, ieee_quad_format); + +/* For floating point conditions in FCC registers. */ +CC_MODE (FCC); diff --git a/gcc/config/loongarch/loongarch-opts.cc b/gcc/config/loongarch/loongarch-opts.cc new file mode 100644 index 0000000..eb9c2a5 --- /dev/null +++ b/gcc/config/loongarch/loongarch-opts.cc @@ -0,0 +1,577 @@ +/* Subroutines for loongarch-specific option handling. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "obstack.h" +#include "diagnostic-core.h" +#include "loongarch-cpu.h" +#include "loongarch-opts.h" +#include "loongarch-str.h" + +struct loongarch_target la_target; + +/* ABI-related configuration. */ +#define ABI_COUNT (sizeof(abi_priority_list)/sizeof(struct loongarch_abi)) +static const struct loongarch_abi +abi_priority_list[] = { + {ABI_BASE_LP64D, ABI_EXT_BASE}, + {ABI_BASE_LP64F, ABI_EXT_BASE}, + {ABI_BASE_LP64S, ABI_EXT_BASE}, +}; + +/* Initialize enabled_abi_types from TM_MULTILIB_LIST. */ +#ifdef LA_DISABLE_MULTILIB +#define MULTILIB_LIST_LEN 1 +#else +#define MULTILIB_LIST_LEN (sizeof (tm_multilib_list) / sizeof (int) / 2) +static const int tm_multilib_list[] = { TM_MULTILIB_LIST }; +#endif +static int enabled_abi_types[N_ABI_BASE_TYPES][N_ABI_EXT_TYPES] = { 0 }; + +#define isa_required(ABI) (abi_minimal_isa[(ABI).base][(ABI).ext]) +extern "C" const struct loongarch_isa +abi_minimal_isa[N_ABI_BASE_TYPES][N_ABI_EXT_TYPES]; + +static inline int +is_multilib_enabled (struct loongarch_abi abi) +{ + return enabled_abi_types[abi.base][abi.ext]; +} + +static void +init_enabled_abi_types () +{ +#ifdef LA_DISABLE_MULTILIB + enabled_abi_types[DEFAULT_ABI_BASE][DEFAULT_ABI_EXT] = 1; +#else + int abi_base, abi_ext; + for (unsigned int i = 0; i < MULTILIB_LIST_LEN; i++) + { + abi_base = tm_multilib_list[i << 1]; + abi_ext = tm_multilib_list[(i << 1) + 1]; + enabled_abi_types[abi_base][abi_ext] = 1; + } +#endif +} + +/* Switch masks. */ +#undef M +#define M(NAME) OPTION_MASK_##NAME +const int loongarch_switch_mask[N_SWITCH_TYPES] = { + /* SW_SOFT_FLOAT */ M(FORCE_SOFTF), + /* SW_SINGLE_FLOAT */ M(FORCE_F32), + /* SW_DOUBLE_FLOAT */ M(FORCE_F64), +}; +#undef M + +/* String processing. 
*/ +static struct obstack msg_obstack; +#define APPEND_STRING(STR) obstack_grow (&msg_obstack, STR, strlen(STR)); +#define APPEND1(CH) obstack_1grow(&msg_obstack, CH); + +static const char* abi_str (struct loongarch_abi abi); +static const char* isa_str (const struct loongarch_isa *isa, char separator); +static const char* arch_str (const struct loongarch_target *target); +static const char* multilib_enabled_abi_list (); + +/* Misc */ +static struct loongarch_abi isa_default_abi (const struct loongarch_isa *isa); +static int isa_base_compat_p (const struct loongarch_isa *set1, + const struct loongarch_isa *set2); +static int isa_fpu_compat_p (const struct loongarch_isa *set1, + const struct loongarch_isa *set2); +static int abi_compat_p (const struct loongarch_isa *isa, + struct loongarch_abi abi); +static int abi_default_cpu_arch (struct loongarch_abi abi); + +/* Checking configure-time defaults. */ +#ifndef DEFAULT_ABI_BASE +#error missing definition of DEFAULT_ABI_BASE in ${tm_defines}. +#endif + +#ifndef DEFAULT_ABI_EXT +#error missing definition of DEFAULT_ABI_EXT in ${tm_defines}. +#endif + +#ifndef DEFAULT_CPU_ARCH +#error missing definition of DEFAULT_CPU_ARCH in ${tm_defines}. +#endif + +#ifndef DEFAULT_ISA_EXT_FPU +#error missing definition of DEFAULT_ISA_EXT_FPU in ${tm_defines}. +#endif + +/* Handle combinations of -m machine option values + (see loongarch.opt and loongarch-opts.h). */ +void +loongarch_config_target (struct loongarch_target *target, + HOST_WIDE_INT opt_switches, + int opt_arch, int opt_tune, int opt_fpu, + int opt_abi_base, int opt_abi_ext, + int opt_cmodel, int follow_multilib_list) +{ + struct loongarch_target t; + + if (!target) + return; + + /* Initialization */ + init_enabled_abi_types (); + obstack_init (&msg_obstack); + + struct { + int arch, tune, fpu, abi_base, abi_ext, cmodel; + } constrained = { + M_OPT_ABSENT(opt_arch) ? 0 : 1, + M_OPT_ABSENT(opt_tune) ? 0 : 1, + M_OPT_ABSENT(opt_fpu) ? 0 : 1, + M_OPT_ABSENT(opt_abi_base) ? 0 : 1, + M_OPT_ABSENT(opt_abi_ext) ? 0 : 1, + M_OPT_ABSENT(opt_cmodel) ? 0 : 1, + }; + +#define on(NAME) ((loongarch_switch_mask[(SW_##NAME)] & opt_switches) \ + && (on_switch = (SW_##NAME), 1)) + int on_switch; + + /* 1. Target ABI */ + t.abi.base = constrained.abi_base ? opt_abi_base : DEFAULT_ABI_BASE; + + t.abi.ext = constrained.abi_ext ? opt_abi_ext : DEFAULT_ABI_EXT; + + /* Extra switch handling. */ + if (on (SOFT_FLOAT) || on (SINGLE_FLOAT) || on (DOUBLE_FLOAT)) + { + switch (on_switch) + { + case SW_SOFT_FLOAT: + opt_fpu = ISA_EXT_NOFPU; + break; + + case SW_SINGLE_FLOAT: + opt_fpu = ISA_EXT_FPU32; + break; + + case SW_DOUBLE_FLOAT: + opt_fpu = ISA_EXT_FPU64; + break; + + default: + gcc_unreachable(); + } + constrained.fpu = 1; + + /* The target ISA is not ready yet, but (isa_required (t.abi) + + forced fpu) is enough for computing the forced base ABI. 
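     As an illustrative trace (hypothetical command line; the behavior follows
     from the statements below), "-mabi=lp64d -msoft-float" gives:

       opt_fpu        = ISA_EXT_NOFPU                     (constrained.fpu = 1)
       force_isa.fpu  = ISA_EXT_NOFPU
       force_abi.base = isa_default_abi (&force_isa).base  = ABI_BASE_LP64S

     so the inform () below reports that -msoft-float overrides -mabi=lp64d and
     the base ABI is adjusted to lp64s. 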
*/ + struct loongarch_isa default_isa = isa_required (t.abi); + struct loongarch_isa force_isa = default_isa; + struct loongarch_abi force_abi = t.abi; + force_isa.fpu = opt_fpu; + force_abi.base = isa_default_abi (&force_isa).base; + + if (constrained.abi_base && (t.abi.base != force_abi.base)) + inform (UNKNOWN_LOCATION, + "%<-m%s%> overrides %<-m%s=%s%>, adjusting ABI to %qs", + loongarch_switch_strings[on_switch], + OPTSTR_ABI_BASE, loongarch_abi_base_strings[t.abi.base], + abi_str (force_abi)); + + t.abi.base = force_abi.base; + } + +#ifdef LA_DISABLE_MULTILIB + if (follow_multilib_list) + if (t.abi.base != DEFAULT_ABI_BASE || t.abi.ext != DEFAULT_ABI_EXT) + { + static const struct loongarch_abi default_abi + = {DEFAULT_ABI_BASE, DEFAULT_ABI_EXT}; + + warning (0, "ABI changed (%qs to %qs) while multilib is disabled", + abi_str (default_abi), abi_str (t.abi)); + } +#endif + + /* 2. Target CPU */ + t.cpu_arch = constrained.arch ? opt_arch : DEFAULT_CPU_ARCH; + + t.cpu_tune = constrained.tune ? opt_tune + : (constrained.arch ? DEFAULT_CPU_ARCH : DEFAULT_CPU_TUNE); + +#ifdef __loongarch__ + /* For native compilers, gather local CPU information + and fill the "CPU_NATIVE" index of arrays defined in + loongarch-cpu.c. */ + + t.cpu_native = fill_native_cpu_config (t.cpu_arch == CPU_NATIVE, + t.cpu_tune == CPU_NATIVE); + +#else + if (t.cpu_arch == CPU_NATIVE) + fatal_error (UNKNOWN_LOCATION, + "%qs does not work on a cross compiler", + "-m" OPTSTR_ARCH "=" STR_CPU_NATIVE); + + else if (t.cpu_tune == CPU_NATIVE) + fatal_error (UNKNOWN_LOCATION, + "%qs does not work on a cross compiler", + "-m" OPTSTR_TUNE "=" STR_CPU_NATIVE); +#endif + + /* 3. Target ISA */ +config_target_isa: + + /* Get default ISA from "-march" or its default value. */ + t.isa = loongarch_cpu_default_isa[LARCH_ACTUAL_ARCH]; + + /* Apply incremental changes. */ + /* "-march=native" overrides the default FPU type. */ + t.isa.fpu = constrained.fpu ? opt_fpu : + ((t.cpu_arch == CPU_NATIVE && constrained.arch) ? + t.isa.fpu : DEFAULT_ISA_EXT_FPU); + + + /* 4. ABI-ISA compatibility */ + /* Note: + - There IS a unique default -march value for each ABI type + (config.gcc: triplet -> abi -> default arch). + + - If the base ABI is incompatible with the default arch, + try using the default -march it implies (and mark it + as "constrained" this time), then re-apply step 3. */ + + struct loongarch_abi abi_tmp; + const struct loongarch_isa* isa_min; + + abi_tmp = t.abi; + isa_min = &isa_required (abi_tmp); + + if (isa_base_compat_p (&t.isa, isa_min)); /* OK. */ + else if (!constrained.arch) + { + /* Base architecture can only be implied by -march, + so we adjust that first if it is not constrained. */ + int fallback_arch = abi_default_cpu_arch (t.abi); + + if (t.cpu_arch == CPU_NATIVE) + warning (0, "your native CPU architecture (%qs) " + "does not support %qs ABI, falling back to %<-m%s=%s%>", + arch_str (&t), abi_str (t.abi), OPTSTR_ARCH, + loongarch_cpu_strings[fallback_arch]); + else + warning (0, "default CPU architecture (%qs) " + "does not support %qs ABI, falling back to %<-m%s=%s%>", + arch_str (&t), abi_str (t.abi), OPTSTR_ARCH, + loongarch_cpu_strings[fallback_arch]); + + t.cpu_arch = fallback_arch; + constrained.arch = 1; + goto config_target_isa; + } + else if (!constrained.abi_base) + { + /* If -march is given while -mabi is not, + try selecting another base ABI type. */ + abi_tmp.base = isa_default_abi (&t.isa).base; + } + else + goto fatal; + + if (isa_fpu_compat_p (&t.isa, isa_min)); /* OK. 
*/ + else if (!constrained.fpu) + t.isa.fpu = isa_min->fpu; + else if (!constrained.abi_base) + /* If -march is compatible with the default ABI + while -mfpu is not. */ + abi_tmp.base = isa_default_abi (&t.isa).base; + else + goto fatal; + + if (0) +fatal: + fatal_error (UNKNOWN_LOCATION, + "unable to implement ABI %qs with instruction set %qs", + abi_str (t.abi), isa_str (&t.isa, '/')); + + + /* Using the fallback ABI. */ + if (abi_tmp.base != t.abi.base || abi_tmp.ext != t.abi.ext) + { + /* This flag is only set in the GCC driver. */ + if (follow_multilib_list) + { + + /* Continue falling back until we find a feasible ABI type + enabled by TM_MULTILIB_LIST. */ + if (!is_multilib_enabled (abi_tmp)) + { + for (unsigned int i = 0; i < ABI_COUNT; i++) + { + if (is_multilib_enabled (abi_priority_list[i]) + && abi_compat_p (&t.isa, abi_priority_list[i])) + { + abi_tmp = abi_priority_list[i]; + + warning (0, "ABI %qs cannot be implemented due to " + "limited instruction set %qs, " + "falling back to %qs", abi_str (t.abi), + isa_str (&t.isa, '/'), abi_str (abi_tmp)); + + goto fallback; + } + } + + /* Otherwise, keep using abi_tmp with a warning. */ +#ifdef LA_DISABLE_MULTILIB + warning (0, "instruction set %qs cannot implement " + "default ABI %qs, falling back to %qs", + isa_str (&t.isa, '/'), abi_str (t.abi), + abi_str (abi_tmp)); +#else + warning (0, "no multilib-enabled ABI (%qs) can be implemented " + "with instruction set %qs, falling back to %qs", + multilib_enabled_abi_list (), + isa_str (&t.isa, '/'), abi_str (abi_tmp)); +#endif + } + } + +fallback: + t.abi = abi_tmp; + } + else if (follow_multilib_list) + { + if (!is_multilib_enabled (t.abi)) + { + inform (UNKNOWN_LOCATION, + "ABI %qs is not enabled at configure-time, " + "the linker might report an error", abi_str (t.abi)); + + inform (UNKNOWN_LOCATION, "ABI with startfiles: %s", + multilib_enabled_abi_list ()); + } + } + + + /* 5. Target code model */ + t.cmodel = constrained.cmodel ? opt_cmodel : CMODEL_NORMAL; + + /* Cleanup and return. */ + obstack_free (&msg_obstack, NULL); + *target = t; +} + +/* Returns the default ABI for the given instruction set. */ +static inline struct loongarch_abi +isa_default_abi (const struct loongarch_isa *isa) +{ + struct loongarch_abi abi; + + switch (isa->fpu) + { + case ISA_EXT_FPU64: + if (isa->base == ISA_BASE_LA64V100) + abi.base = ABI_BASE_LP64D; + break; + + case ISA_EXT_FPU32: + if (isa->base == ISA_BASE_LA64V100) + abi.base = ABI_BASE_LP64F; + break; + + case ISA_EXT_NOFPU: + if (isa->base == ISA_BASE_LA64V100) + abi.base = ABI_BASE_LP64S; + break; + + default: + gcc_unreachable (); + } + + abi.ext = ABI_EXT_BASE; + return abi; +} + +/* Check if set2 is a subset of set1. 
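     "Subset" here is in terms of capability: for example, an ISA whose FPU
     extension is ISA_EXT_FPU64 satisfies a requirement of ISA_EXT_FPU32 in
     isa_fpu_compat_p below, while an FPU32-only ISA does not satisfy an
     FPU64 requirement (illustrative restatement of the checks that follow). 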
*/ +static inline int +isa_base_compat_p (const struct loongarch_isa *set1, + const struct loongarch_isa *set2) +{ + switch (set2->base) + { + case ISA_BASE_LA64V100: + return (set1->base == ISA_BASE_LA64V100); + + default: + gcc_unreachable (); + } +} + +static inline int +isa_fpu_compat_p (const struct loongarch_isa *set1, + const struct loongarch_isa *set2) +{ + switch (set2->fpu) + { + case ISA_EXT_FPU64: + return set1->fpu == ISA_EXT_FPU64; + + case ISA_EXT_FPU32: + return set1->fpu == ISA_EXT_FPU32 || set1->fpu == ISA_EXT_FPU64; + + case ISA_EXT_NOFPU: + return 1; + + default: + gcc_unreachable (); + } + +} + +static inline int +abi_compat_p (const struct loongarch_isa *isa, struct loongarch_abi abi) +{ + int compatible = 1; + const struct loongarch_isa *isa2 = &isa_required (abi); + + /* Append conditionals for new ISA components below. */ + compatible = compatible && isa_base_compat_p (isa, isa2); + compatible = compatible && isa_fpu_compat_p (isa, isa2); + return compatible; +} + +/* The behavior of this function should be consistent + with config.gcc. */ +static inline int +abi_default_cpu_arch (struct loongarch_abi abi) +{ + switch (abi.base) + { + case ABI_BASE_LP64D: + case ABI_BASE_LP64F: + case ABI_BASE_LP64S: + if (abi.ext == ABI_EXT_BASE) + return CPU_LOONGARCH64; + } + gcc_unreachable (); +} + +static const char* +abi_str (struct loongarch_abi abi) +{ + /* "/base" can be omitted. */ + if (abi.ext == ABI_EXT_BASE) + return (const char*) + obstack_copy0 (&msg_obstack, loongarch_abi_base_strings[abi.base], + strlen (loongarch_abi_base_strings[abi.base])); + else + { + APPEND_STRING (loongarch_abi_base_strings[abi.base]) + APPEND1 ('/') + APPEND_STRING (loongarch_abi_ext_strings[abi.ext]) + APPEND1 ('\0') + + return XOBFINISH (&msg_obstack, const char *); + } +} + +static const char* +isa_str (const struct loongarch_isa *isa, char separator) +{ + APPEND_STRING (loongarch_isa_base_strings[isa->base]) + APPEND1 (separator) + + if (isa->fpu == ISA_EXT_NOFPU) + { + APPEND_STRING ("no" OPTSTR_ISA_EXT_FPU) + } + else + { + APPEND_STRING (OPTSTR_ISA_EXT_FPU) + APPEND_STRING (loongarch_isa_ext_strings[isa->fpu]) + } + APPEND1 ('\0') + + /* Add more here. */ + + return XOBFINISH (&msg_obstack, const char *); +} + +static const char* +arch_str (const struct loongarch_target *target) +{ + if (target->cpu_arch == CPU_NATIVE) + { + if (target->cpu_native == CPU_NATIVE) + { + /* Describe a native CPU with unknown PRID. 
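     The resulting description looks roughly like
     "PRID: 0x<prid>, ISA features: la64,fpu64" (illustrative output only; the
     actual PRID digits come from get_native_prid_str () and the feature names
     from the string tables in loongarch-str.h). 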
*/ + const char* isa_string = isa_str (&target->isa, ','); + APPEND_STRING ("PRID: 0x") + APPEND_STRING (get_native_prid_str ()) + APPEND_STRING (", ISA features: ") + APPEND_STRING (isa_string) + APPEND1 ('\0') + } + else + APPEND_STRING (loongarch_cpu_strings[target->cpu_native]); + } + else + APPEND_STRING (loongarch_cpu_strings[target->cpu_arch]); + + APPEND1 ('\0') + return XOBFINISH (&msg_obstack, const char *); +} + +static const char* +multilib_enabled_abi_list () +{ + int enabled_abi_idx[MULTILIB_LIST_LEN] = { 0 }; + const char* enabled_abi_str[MULTILIB_LIST_LEN] = { NULL }; + unsigned int j = 0; + + for (unsigned int i = 0; i < ABI_COUNT && j < MULTILIB_LIST_LEN; i++) + { + if (enabled_abi_types[abi_priority_list[i].base] + [abi_priority_list[i].ext]) + { + enabled_abi_idx[j++] = i; + } + } + + for (unsigned int k = 0; k < j; k++) + { + enabled_abi_str[k] = abi_str (abi_priority_list[enabled_abi_idx[k]]); + } + + for (unsigned int k = 0; k < j - 1; k++) + { + APPEND_STRING (enabled_abi_str[k]) + APPEND1 (',') + APPEND1 (' ') + } + APPEND_STRING (enabled_abi_str[j - 1]) + APPEND1 ('\0') + + return XOBFINISH (&msg_obstack, const char *); +} diff --git a/gcc/config/loongarch/loongarch-opts.h b/gcc/config/loongarch/loongarch-opts.h new file mode 100644 index 0000000..eaa6fc0 --- /dev/null +++ b/gcc/config/loongarch/loongarch-opts.h @@ -0,0 +1,90 @@ +/* Definitions for loongarch-specific option handling. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef LOONGARCH_OPTS_H +#define LOONGARCH_OPTS_H + + +/* Target configuration */ +extern struct loongarch_target la_target; + +/* Switch masks */ +extern const int loongarch_switch_mask[]; + +#include "loongarch-def.h" + +#if !defined(IN_LIBGCC2) && !defined(IN_TARGET_LIBS) && !defined(IN_RTS) +/* Handler for "-m" option combinations, + shared by the driver and the compiler proper. 
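     A caller passes the raw "-m" option values and receives the resolved
     configuration, along the lines of the following sketch (the caller-side
     variable names are hypothetical; only the signature below is from the
     patch):

       struct loongarch_target t;
       loongarch_config_target (&t, opt_switches, opt_arch, opt_tune, opt_fpu,
                                opt_abi_base, opt_abi_ext, opt_cmodel,
                                /* follow_multilib_list */ 0);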
*/ +void +loongarch_config_target (struct loongarch_target *target, + HOST_WIDE_INT opt_switches, + int opt_arch, int opt_tune, int opt_fpu, + int opt_abi_base, int opt_abi_ext, + int opt_cmodel, int follow_multilib_list); +#endif + + +/* Macros for common conditional expressions used in loongarch.{c,h,md} */ +#define TARGET_CMODEL_NORMAL (la_target.cmodel == CMODEL_NORMAL) +#define TARGET_CMODEL_TINY (la_target.cmodel == CMODEL_TINY) +#define TARGET_CMODEL_TINY_STATIC (la_target.cmodel == CMODEL_TINY_STATIC) +#define TARGET_CMODEL_LARGE (la_target.cmodel == CMODEL_LARGE) +#define TARGET_CMODEL_EXTREME (la_target.cmodel == CMODEL_EXTREME) + +#define TARGET_HARD_FLOAT (la_target.isa.fpu != ISA_EXT_NOFPU) +#define TARGET_HARD_FLOAT_ABI (la_target.abi.base == ABI_BASE_LP64D \ + || la_target.abi.base == ABI_BASE_LP64F) + +#define TARGET_SOFT_FLOAT (la_target.isa.fpu == ISA_EXT_NOFPU) +#define TARGET_SOFT_FLOAT_ABI (la_target.abi.base == ABI_BASE_LP64S) +#define TARGET_SINGLE_FLOAT (la_target.isa.fpu == ISA_EXT_FPU32) +#define TARGET_SINGLE_FLOAT_ABI (la_target.abi.base == ABI_BASE_LP64F) +#define TARGET_DOUBLE_FLOAT (la_target.isa.fpu == ISA_EXT_FPU64) +#define TARGET_DOUBLE_FLOAT_ABI (la_target.abi.base == ABI_BASE_LP64D) + +#define TARGET_64BIT (la_target.isa.base == ISA_BASE_LA64V100) +#define TARGET_ABI_LP64 (la_target.abi.base == ABI_BASE_LP64D \ + || la_target.abi.base == ABI_BASE_LP64F \ + || la_target.abi.base == ABI_BASE_LP64S) + +#define TARGET_ARCH_NATIVE (la_target.cpu_arch == CPU_NATIVE) +#define LARCH_ACTUAL_ARCH (TARGET_ARCH_NATIVE \ + ? (la_target.cpu_native < N_ARCH_TYPES \ + ? (la_target.cpu_native) : (CPU_NATIVE)) \ + : (la_target.cpu_arch)) + +#define TARGET_TUNE_NATIVE (la_target.cpu_tune == CPU_NATIVE) +#define LARCH_ACTUAL_TUNE (TARGET_TUNE_NATIVE \ + ? (la_target.cpu_native < N_TUNE_TYPES \ + ? (la_target.cpu_native) : (CPU_NATIVE)) \ + : (la_target.cpu_tune)) + +#define TARGET_ARCH_LOONGARCH64 (LARCH_ACTUAL_ARCH == CPU_LOONGARCH64) +#define TARGET_ARCH_LA464 (LARCH_ACTUAL_ARCH == CPU_LA464) + +#define TARGET_TUNE_LOONGARCH64 (LARCH_ACTUAL_TUNE == CPU_LOONGARCH64) +#define TARGET_TUNE_LA464 (LARCH_ACTUAL_TUNE == CPU_LA464) + +/* Note: optimize_size may vary across functions, + while -m[no]-memcpy imposes a global constraint. */ +#define TARGET_DO_OPTIMIZE_BLOCK_MOVE_P loongarch_do_optimize_block_move_p() + +#endif /* LOONGARCH_OPTS_H */ diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h new file mode 100644 index 0000000..2144c24 --- /dev/null +++ b/gcc/config/loongarch/loongarch-protos.h @@ -0,0 +1,172 @@ +/* Prototypes of target machine for GNU compiler. LoongArch version. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + Based on MIPS target for GNU compiler. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. 
*/ + +#ifndef GCC_LOONGARCH_PROTOS_H +#define GCC_LOONGARCH_PROTOS_H + +/* Classifies a SYMBOL_REF, LABEL_REF or UNSPEC address. + + SYMBOL_GOT_DISP + The symbol's value will be loaded directly from the GOT. + + SYMBOL_TLS + A thread-local symbol. + + SYMBOL_TLSGD + SYMBOL_TLSLDM + UNSPEC wrappers around SYMBOL_TLS, corresponding to the + thread-local storage relocation operators. + */ +enum loongarch_symbol_type { + SYMBOL_GOT_DISP, + SYMBOL_TLS, + SYMBOL_TLSGD, + SYMBOL_TLSLDM, +}; +#define NUM_SYMBOL_TYPES (SYMBOL_TLSLDM + 1) + +/* Routines implemented in loongarch.c. */ +extern rtx loongarch_emit_move (rtx, rtx); +extern HOST_WIDE_INT loongarch_initial_elimination_offset (int, int); +extern void loongarch_expand_prologue (void); +extern void loongarch_expand_epilogue (bool); +extern bool loongarch_can_use_return_insn (void); + +extern bool loongarch_symbolic_constant_p (rtx, enum loongarch_symbol_type *); +extern int loongarch_regno_mode_ok_for_base_p (int, machine_mode, bool); +extern int loongarch_address_insns (rtx, machine_mode, bool); +extern int loongarch_const_insns (rtx); +extern int loongarch_split_const_insns (rtx); +extern int loongarch_split_128bit_const_insns (rtx); +extern int loongarch_load_store_insns (rtx, rtx_insn *); +extern int loongarch_idiv_insns (machine_mode); +#ifdef RTX_CODE +extern void loongarch_emit_binary (enum rtx_code, rtx, rtx, rtx); +#endif +extern bool loongarch_split_symbol (rtx, rtx, machine_mode, rtx *); +extern rtx loongarch_unspec_address (rtx, enum loongarch_symbol_type); +extern rtx loongarch_strip_unspec_address (rtx); +extern void loongarch_move_integer (rtx, rtx, unsigned HOST_WIDE_INT); +extern bool loongarch_legitimize_move (machine_mode, rtx, rtx); +extern rtx loongarch_legitimize_call_address (rtx); + +extern rtx loongarch_subword (rtx, bool); +extern bool loongarch_split_move_p (rtx, rtx); +extern void loongarch_split_move (rtx, rtx, rtx); +extern bool loongarch_split_move_insn_p (rtx, rtx); +extern void loongarch_split_move_insn (rtx, rtx, rtx); +extern const char *loongarch_output_move (rtx, rtx); +extern bool loongarch_cfun_has_cprestore_slot_p (void); +#ifdef RTX_CODE +extern void loongarch_expand_scc (rtx *); +extern void loongarch_expand_conditional_branch (rtx *); +extern void loongarch_expand_conditional_move (rtx *); +extern void loongarch_expand_conditional_trap (rtx); +#endif +extern void loongarch_set_return_address (rtx, rtx); +extern bool loongarch_move_by_pieces_p (unsigned HOST_WIDE_INT, unsigned int); +extern bool loongarch_expand_block_move (rtx, rtx, rtx); +extern bool loongarch_do_optimize_block_move_p (void); + +extern bool loongarch_expand_ext_as_unaligned_load (rtx, rtx, HOST_WIDE_INT, + HOST_WIDE_INT, bool); +extern bool loongarch_expand_ins_as_unaligned_store (rtx, rtx, HOST_WIDE_INT, + HOST_WIDE_INT); +extern HOST_WIDE_INT loongarch_debugger_offset (rtx, HOST_WIDE_INT); + +extern void loongarch_output_external (FILE *, tree, const char *); +extern void loongarch_output_ascii (FILE *, const char *, size_t); +extern bool loongarch_small_data_pattern_p (rtx); +extern rtx loongarch_rewrite_small_data (rtx); +extern rtx loongarch_return_addr (int, rtx); + +extern enum reg_class loongarch_secondary_reload_class (enum reg_class, + machine_mode, + rtx, bool); +extern int loongarch_class_max_nregs (enum reg_class, machine_mode); + +extern machine_mode loongarch_hard_regno_caller_save_mode (unsigned int, + unsigned int, + machine_mode); +extern int loongarch_adjust_insn_length (rtx_insn *, int); +extern const char 
*loongarch_output_conditional_branch (rtx_insn *, rtx *, + const char *, + const char *); +extern const char *loongarch_output_order_conditional_branch (rtx_insn *, + rtx *, + bool); +extern const char *loongarch_output_equal_conditional_branch (rtx_insn *, + rtx *, + bool); +extern const char *loongarch_output_division (const char *, rtx *); +extern const char *loongarch_output_probe_stack_range (rtx, rtx, rtx); +extern bool loongarch_hard_regno_rename_ok (unsigned int, unsigned int); +extern int loongarch_dspalu_bypass_p (rtx, rtx); +extern rtx loongarch_prefetch_cookie (rtx, rtx); + +extern bool loongarch_global_symbol_p (const_rtx); +extern bool loongarch_global_symbol_noweak_p (const_rtx); +extern bool loongarch_weak_symbol_p (const_rtx); +extern bool loongarch_symbol_binds_local_p (const_rtx); + +extern const char *current_section_name (void); +extern unsigned int current_section_flags (void); +extern bool loongarch_use_ins_ext_p (rtx, HOST_WIDE_INT, HOST_WIDE_INT); + +union loongarch_gen_fn_ptrs +{ + rtx (*fn_8) (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx); + rtx (*fn_7) (rtx, rtx, rtx, rtx, rtx, rtx, rtx); + rtx (*fn_6) (rtx, rtx, rtx, rtx, rtx, rtx); + rtx (*fn_5) (rtx, rtx, rtx, rtx, rtx); + rtx (*fn_4) (rtx, rtx, rtx, rtx); +}; + +extern void loongarch_expand_atomic_qihi (union loongarch_gen_fn_ptrs, + rtx, rtx, rtx, rtx, rtx); + +extern bool loongarch_signed_immediate_p (unsigned HOST_WIDE_INT, int, int); +extern bool loongarch_unsigned_immediate_p (unsigned HOST_WIDE_INT, int, int); +extern bool loongarch_12bit_offset_address_p (rtx, machine_mode); +extern bool loongarch_14bit_shifted_offset_address_p (rtx, machine_mode); +extern bool loongarch_base_index_address_p (rtx, machine_mode); +extern rtx loongarch_expand_thread_pointer (rtx); + +extern bool loongarch_eh_uses (unsigned int); +extern bool loongarch_epilogue_uses (unsigned int); +extern bool loongarch_load_store_bonding_p (rtx *, machine_mode, bool); +extern bool loongarch_split_symbol_type (enum loongarch_symbol_type); + +typedef rtx (*mulsidi3_gen_fn) (rtx, rtx, rtx); + +extern void loongarch_register_frame_header_opt (void); + +/* Routines implemented in loongarch-c.c. */ +void loongarch_cpu_cpp_builtins (cpp_reader *); + +extern void loongarch_init_builtins (void); +extern void loongarch_atomic_assign_expand_fenv (tree *, tree *, tree *); +extern tree loongarch_builtin_decl (unsigned int, bool); +extern rtx loongarch_expand_builtin (tree, rtx, rtx subtarget ATTRIBUTE_UNUSED, + machine_mode, int); +extern tree loongarch_build_builtin_va_list (void); + +#endif /* ! GCC_LOONGARCH_PROTOS_H */ diff --git a/gcc/config/loongarch/loongarch-str.h b/gcc/config/loongarch/loongarch-str.h new file mode 100644 index 0000000..0e8889b --- /dev/null +++ b/gcc/config/loongarch/loongarch-str.h @@ -0,0 +1,59 @@ +/* Generated automatically by "genstr" from "loongarch-strings". + Please do not edit this file directly. + + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef LOONGARCH_STR_H +#define LOONGARCH_STR_H + +#define OPTSTR_ARCH "arch" +#define OPTSTR_TUNE "tune" + +#define STR_CPU_NATIVE "native" +#define STR_CPU_LOONGARCH64 "loongarch64" +#define STR_CPU_LA464 "la464" + +#define STR_ISA_BASE_LA64V100 "la64" + +#define OPTSTR_ISA_EXT_FPU "fpu" +#define STR_ISA_EXT_NOFPU "none" +#define STR_ISA_EXT_FPU0 "0" +#define STR_ISA_EXT_FPU32 "32" +#define STR_ISA_EXT_FPU64 "64" + +#define OPTSTR_SOFT_FLOAT "soft-float" +#define OPTSTR_SINGLE_FLOAT "single-float" +#define OPTSTR_DOUBLE_FLOAT "double-float" + +#define OPTSTR_ABI_BASE "abi" +#define STR_ABI_BASE_LP64D "lp64d" +#define STR_ABI_BASE_LP64F "lp64f" +#define STR_ABI_BASE_LP64S "lp64s" + +#define STR_ABI_EXT_BASE "base" + +#define OPTSTR_CMODEL "cmodel" +#define STR_CMODEL_NORMAL "normal" +#define STR_CMODEL_TINY "tiny" +#define STR_CMODEL_TS "tiny-static" +#define STR_CMODEL_LARGE "large" +#define STR_CMODEL_EXTREME "extreme" + +#endif /* LOONGARCH_STR_H */ diff --git a/gcc/config/loongarch/loongarch-tune.h b/gcc/config/loongarch/loongarch-tune.h new file mode 100644 index 0000000..6f3530f --- /dev/null +++ b/gcc/config/loongarch/loongarch-tune.h @@ -0,0 +1,50 @@ +/* Definitions for microarchitecture-related data structures. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#ifndef LOONGARCH_TUNE_H +#define LOONGARCH_TUNE_H + +/* RTX costs of various operations on the different architectures. */ +struct loongarch_rtx_cost_data +{ + unsigned short fp_add; + unsigned short fp_mult_sf; + unsigned short fp_mult_df; + unsigned short fp_div_sf; + unsigned short fp_div_df; + unsigned short int_mult_si; + unsigned short int_mult_di; + unsigned short int_div_si; + unsigned short int_div_di; + unsigned short branch_cost; + unsigned short memory_latency; +}; + +/* Costs to use when optimizing for size. */ +extern const struct loongarch_rtx_cost_data loongarch_rtx_cost_optimize_size; + +/* Cache size record of known processor models. */ +struct loongarch_cache { + int l1d_line_size; /* bytes */ + int l1d_size; /* KiB */ + int l2d_size; /* kiB */ +}; + +#endif /* LOONGARCH_TUNE_H */ diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc new file mode 100644 index 0000000..80046b6 --- /dev/null +++ b/gcc/config/loongarch/loongarch.cc @@ -0,0 +1,5950 @@ +/* Subroutines used for LoongArch code generation. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + Based on MIPS and RISC-V target for GNU compiler. + +This file is part of GCC. 
+ +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "attribs.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "insn-attr.h" +#include "output.h" +#include "alias.h" +#include "fold-const.h" +#include "varasm.h" +#include "stor-layout.h" +#include "calls.h" +#include "explow.h" +#include "expr.h" +#include "libfuncs.h" +#include "reload.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "cfgrtl.h" +#include "cfganal.h" +#include "sched-int.h" +#include "gimplify.h" +#include "target-globals.h" +#include "tree-pass.h" +#include "context.h" +#include "builtins.h" +#include "rtl-iter.h" + +/* This file should be included last. */ +#include "target-def.h" + +/* True if X is an UNSPEC wrapper around a SYMBOL_REF or LABEL_REF. */ +#define UNSPEC_ADDRESS_P(X) \ + (GET_CODE (X) == UNSPEC \ + && XINT (X, 1) >= UNSPEC_ADDRESS_FIRST \ + && XINT (X, 1) < UNSPEC_ADDRESS_FIRST + NUM_SYMBOL_TYPES) + +/* Extract the symbol or label from UNSPEC wrapper X. */ +#define UNSPEC_ADDRESS(X) XVECEXP (X, 0, 0) + +/* Extract the symbol type from UNSPEC wrapper X. */ +#define UNSPEC_ADDRESS_TYPE(X) \ + ((enum loongarch_symbol_type) (XINT (X, 1) - UNSPEC_ADDRESS_FIRST)) + +/* True if INSN is a loongarch.md pattern or asm statement. */ +/* ??? This test exists through the compiler, perhaps it should be + moved to rtl.h. */ +#define USEFUL_INSN_P(INSN) \ + (NONDEBUG_INSN_P (INSN) \ + && GET_CODE (PATTERN (INSN)) != USE \ + && GET_CODE (PATTERN (INSN)) != CLOBBER) + +/* True if bit BIT is set in VALUE. */ +#define BITSET_P(VALUE, BIT) (((VALUE) & (1 << (BIT))) != 0) + +/* Classifies an address. + + ADDRESS_REG + A natural register + offset address. The register satisfies + loongarch_valid_base_register_p and the offset is a const_arith_operand. + + ADDRESS_REG_REG + A base register indexed by (optionally scaled) register. + + ADDRESS_CONST_INT + A signed 16-bit constant address. + + ADDRESS_SYMBOLIC: + A constant symbolic address. */ +enum loongarch_address_type +{ + ADDRESS_REG, + ADDRESS_REG_REG, + ADDRESS_CONST_INT, + ADDRESS_SYMBOLIC +}; + + +/* Information about an address described by loongarch_address_type. + + ADDRESS_CONST_INT + No fields are used. + + ADDRESS_REG + REG is the base register and OFFSET is the constant offset. + + ADDRESS_REG_REG + A base register indexed by (optionally scaled) register. + + ADDRESS_SYMBOLIC + SYMBOL_TYPE is the type of symbol that the address references. 
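     For example (illustrative RTL, not from the patch), an address of the form
     (plus (reg) (const_int 8)) is classified as ADDRESS_REG with REG the base
     register and OFFSET the constant 8, while a bare SYMBOL_REF is classified
     as ADDRESS_SYMBOLIC with its SYMBOL_TYPE filled in. 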
*/ +struct loongarch_address_info +{ + enum loongarch_address_type type; + rtx reg; + rtx offset; + enum loongarch_symbol_type symbol_type; +}; + +/* Method of loading instant numbers: + + METHOD_NORMAL: + Load 0-31 bit of the immediate number. + + METHOD_LU32I: + Load 32-51 bit of the immediate number. + + METHOD_LU52I: + Load 52-63 bit of the immediate number. + + METHOD_INSV: + immediate like 0xfff00000fffffxxx + */ +enum loongarch_load_imm_method +{ + METHOD_NORMAL, + METHOD_LU32I, + METHOD_LU52I, + METHOD_INSV +}; + +struct loongarch_integer_op +{ + enum rtx_code code; + unsigned HOST_WIDE_INT value; + enum loongarch_load_imm_method method; +}; + +/* The largest number of operations needed to load an integer constant. + The worst accepted case for 64-bit constants is LU12I.W,LU32I.D,LU52I.D,ORI + or LU12I.W,LU32I.D,LU52I.D,ADDI.D DECL_ASSEMBLER_NAME. */ +#define LARCH_MAX_INTEGER_OPS 4 + +/* Arrays that map GCC register numbers to debugger register numbers. */ +int loongarch_dwarf_regno[FIRST_PSEUDO_REGISTER]; + +/* Index [M][R] is true if register R is allowed to hold a value of mode M. */ +static bool loongarch_hard_regno_mode_ok_p[MAX_MACHINE_MODE] + [FIRST_PSEUDO_REGISTER]; + +/* Index C is true if character C is a valid PRINT_OPERAND punctation + character. */ +static bool loongarch_print_operand_punct[256]; + +/* Cached value of can_issue_more. This is cached in loongarch_variable_issue + hook and returned from loongarch_sched_reorder2. */ +static int cached_can_issue_more; + +/* Index R is the smallest register class that contains register R. */ +const enum reg_class loongarch_regno_to_class[FIRST_PSEUDO_REGISTER] = { + GR_REGS, GR_REGS, GR_REGS, GR_REGS, + JIRL_REGS, JIRL_REGS, JIRL_REGS, JIRL_REGS, + JIRL_REGS, JIRL_REGS, JIRL_REGS, JIRL_REGS, + SIBCALL_REGS, SIBCALL_REGS, SIBCALL_REGS, SIBCALL_REGS, + SIBCALL_REGS, SIBCALL_REGS, SIBCALL_REGS, SIBCALL_REGS, + SIBCALL_REGS, GR_REGS, GR_REGS, JIRL_REGS, + JIRL_REGS, JIRL_REGS, JIRL_REGS, JIRL_REGS, + JIRL_REGS, JIRL_REGS, JIRL_REGS, JIRL_REGS, + + FP_REGS, FP_REGS, FP_REGS, FP_REGS, + FP_REGS, FP_REGS, FP_REGS, FP_REGS, + FP_REGS, FP_REGS, FP_REGS, FP_REGS, + FP_REGS, FP_REGS, FP_REGS, FP_REGS, + FP_REGS, FP_REGS, FP_REGS, FP_REGS, + FP_REGS, FP_REGS, FP_REGS, FP_REGS, + FP_REGS, FP_REGS, FP_REGS, FP_REGS, + FP_REGS, FP_REGS, FP_REGS, FP_REGS, + FCC_REGS, FCC_REGS, FCC_REGS, FCC_REGS, + FCC_REGS, FCC_REGS, FCC_REGS, FCC_REGS, + FRAME_REGS, FRAME_REGS +}; + +/* Which cost information to use. */ +static const struct loongarch_rtx_cost_data *loongarch_cost; + +/* Information about a single argument. */ +struct loongarch_arg_info +{ + /* True if the argument is at least partially passed on the stack. */ + bool stack_p; + + /* The number of integer registers allocated to this argument. */ + unsigned int num_gprs; + + /* The offset of the first register used, provided num_gprs is nonzero. + If passed entirely on the stack, the value is MAX_ARGS_IN_REGISTERS. */ + unsigned int gpr_offset; + + /* The number of floating-point registers allocated to this argument. */ + unsigned int num_fprs; + + /* The offset of the first register used, provided num_fprs is nonzero. */ + unsigned int fpr_offset; +}; + +/* Invoke MACRO (COND) for each fcmp.cond.{s/d} condition. 
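     For instance, instantiating it with DECLARE_LARCH_COND (as done a few
     lines below) expands to the enumerators

       LARCH_FP_COND_f, LARCH_FP_COND_un, LARCH_FP_COND_eq, LARCH_FP_COND_ueq, ...

     one per condition listed in the macro. 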
*/ +#define LARCH_FP_CONDITIONS(MACRO) \ + MACRO (f), \ + MACRO (un), \ + MACRO (eq), \ + MACRO (ueq), \ + MACRO (olt), \ + MACRO (ult), \ + MACRO (ole), \ + MACRO (ule), \ + MACRO (sf), \ + MACRO (ngle), \ + MACRO (seq), \ + MACRO (ngl), \ + MACRO (lt), \ + MACRO (nge), \ + MACRO (le), \ + MACRO (ngt) + +/* Enumerates the codes above as LARCH_FP_COND_<X>. */ +#define DECLARE_LARCH_COND(X) LARCH_FP_COND_##X +enum loongarch_fp_condition +{ + LARCH_FP_CONDITIONS (DECLARE_LARCH_COND) +}; +#undef DECLARE_LARCH_COND + +/* Index X provides the string representation of LARCH_FP_COND_<X>. */ +#define STRINGIFY(X) #X +const char *const +loongarch_fp_conditions[16]= {LARCH_FP_CONDITIONS (STRINGIFY)}; +#undef STRINGIFY + +/* Implement TARGET_FUNCTION_ARG_BOUNDARY. Every parameter gets at + least PARM_BOUNDARY bits of alignment, but will be given anything up + to PREFERRED_STACK_BOUNDARY bits if the type requires it. */ + +static unsigned int +loongarch_function_arg_boundary (machine_mode mode, const_tree type) +{ + unsigned int alignment; + + /* Use natural alignment if the type is not aggregate data. */ + if (type && !AGGREGATE_TYPE_P (type)) + alignment = TYPE_ALIGN (TYPE_MAIN_VARIANT (type)); + else + alignment = type ? TYPE_ALIGN (type) : GET_MODE_ALIGNMENT (mode); + + return MIN (PREFERRED_STACK_BOUNDARY, MAX (PARM_BOUNDARY, alignment)); +} + +/* If MODE represents an argument that can be passed or returned in + floating-point registers, return the number of registers, else 0. */ + +static unsigned +loongarch_pass_mode_in_fpr_p (machine_mode mode) +{ + if (GET_MODE_UNIT_SIZE (mode) <= UNITS_PER_FP_ARG) + { + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + return 1; + + if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT) + return 2; + } + + return 0; +} + +typedef struct +{ + const_tree type; + HOST_WIDE_INT offset; +} loongarch_aggregate_field; + +/* Identify subfields of aggregates that are candidates for passing in + floating-point registers. */ + +static int +loongarch_flatten_aggregate_field (const_tree type, + loongarch_aggregate_field fields[2], int n, + HOST_WIDE_INT offset) +{ + switch (TREE_CODE (type)) + { + case RECORD_TYPE: + /* Can't handle incomplete types nor sizes that are not fixed. */ + if (!COMPLETE_TYPE_P (type) + || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST + || !tree_fits_uhwi_p (TYPE_SIZE (type))) + return -1; + + for (tree f = TYPE_FIELDS (type); f; f = DECL_CHAIN (f)) + if (TREE_CODE (f) == FIELD_DECL) + { + if (!TYPE_P (TREE_TYPE (f))) + return -1; + + if (DECL_SIZE (f) && integer_zerop (DECL_SIZE (f))) + continue; + + HOST_WIDE_INT pos = offset + int_byte_position (f); + n = loongarch_flatten_aggregate_field (TREE_TYPE (f), fields, n, + pos); + if (n < 0) + return -1; + } + return n; + + case ARRAY_TYPE: + { + HOST_WIDE_INT n_elts; + loongarch_aggregate_field subfields[2]; + tree index = TYPE_DOMAIN (type); + tree elt_size = TYPE_SIZE_UNIT (TREE_TYPE (type)); + int n_subfields = loongarch_flatten_aggregate_field (TREE_TYPE (type), + subfields, 0, + offset); + + /* Can't handle incomplete types nor sizes that are not fixed. 
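     For example (illustrative case, not from the patch), a flexible array
     member or a variable-length array fails the checks below, while a fixed
     field such as float v[2] flattens into two SFmode subfields at byte
     offsets 0 and 4 relative to the array. 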
*/ + if (n_subfields <= 0 + || !COMPLETE_TYPE_P (type) + || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST + || !index + || !TYPE_MAX_VALUE (index) + || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index)) + || !TYPE_MIN_VALUE (index) + || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index)) + || !tree_fits_uhwi_p (elt_size)) + return -1; + + n_elts = 1 + tree_to_uhwi (TYPE_MAX_VALUE (index)) + - tree_to_uhwi (TYPE_MIN_VALUE (index)); + gcc_assert (n_elts >= 0); + + for (HOST_WIDE_INT i = 0; i < n_elts; i++) + for (int j = 0; j < n_subfields; j++) + { + if (n >= 2) + return -1; + + fields[n] = subfields[j]; + fields[n++].offset += i * tree_to_uhwi (elt_size); + } + + return n; + } + + case COMPLEX_TYPE: + { + /* A complex type needs to consume 2 fields, so n must be 0. */ + if (n != 0) + return -1; + + HOST_WIDE_INT elt_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (type))); + + if (elt_size <= UNITS_PER_FP_ARG) + { + fields[0].type = TREE_TYPE (type); + fields[0].offset = offset; + fields[1].type = TREE_TYPE (type); + fields[1].offset = offset + elt_size; + + return 2; + } + + return -1; + } + + default: + if (n < 2 + && ((SCALAR_FLOAT_TYPE_P (type) + && GET_MODE_SIZE (TYPE_MODE (type)) <= UNITS_PER_FP_ARG) + || (INTEGRAL_TYPE_P (type) + && GET_MODE_SIZE (TYPE_MODE (type)) <= UNITS_PER_WORD))) + { + fields[n].type = type; + fields[n].offset = offset; + return n + 1; + } + else + return -1; + } +} + +/* Identify candidate aggregates for passing in floating-point registers. + Candidates have at most two fields after flattening. */ + +static int +loongarch_flatten_aggregate_argument (const_tree type, + loongarch_aggregate_field fields[2]) +{ + if (!type || TREE_CODE (type) != RECORD_TYPE) + return -1; + + return loongarch_flatten_aggregate_field (type, fields, 0, 0); +} + +/* See whether TYPE is a record whose fields should be returned in one or + two floating-point registers. If so, populate FIELDS accordingly. */ + +static unsigned +loongarch_pass_aggregate_num_fpr (const_tree type, + loongarch_aggregate_field fields[2]) +{ + int n = loongarch_flatten_aggregate_argument (type, fields); + + for (int i = 0; i < n; i++) + if (!SCALAR_FLOAT_TYPE_P (fields[i].type)) + return 0; + + return n > 0 ? n : 0; +} + +/* See whether TYPE is a record whose fields should be returned in one + floating-point register and one integer register. If so, populate + FIELDS accordingly. */ + +static bool +loongarch_pass_aggregate_in_fpr_and_gpr_p (const_tree type, + loongarch_aggregate_field fields[2]) +{ + unsigned num_int = 0, num_float = 0; + int n = loongarch_flatten_aggregate_argument (type, fields); + + for (int i = 0; i < n; i++) + { + num_float += SCALAR_FLOAT_TYPE_P (fields[i].type); + num_int += INTEGRAL_TYPE_P (fields[i].type); + } + + return num_int == 1 && num_float == 1; +} + +/* Return the representation of an argument passed or returned in an FPR + when the value has mode VALUE_MODE and the type has TYPE_MODE. The + two modes may be different for structures like: + + struct __attribute__((packed)) foo { float f; } + + where the SFmode value "f" is passed in REGNO but the struct itself + has mode BLKmode. */ + +static rtx +loongarch_pass_fpr_single (machine_mode type_mode, unsigned regno, + machine_mode value_mode, + HOST_WIDE_INT offset) +{ + rtx x = gen_rtx_REG (value_mode, regno); + + if (type_mode != value_mode) + { + x = gen_rtx_EXPR_LIST (VOIDmode, x, GEN_INT (offset)); + x = gen_rtx_PARALLEL (type_mode, gen_rtvec (1, x)); + } + return x; +} + +/* Pass or return a composite value in the FPR pair REGNO and REGNO + 1. 
+ MODE is the mode of the composite. MODE1 and OFFSET1 are the mode and + byte offset for the first value, likewise MODE2 and OFFSET2 for the + second value. */ + +static rtx +loongarch_pass_fpr_pair (machine_mode mode, unsigned regno1, + machine_mode mode1, HOST_WIDE_INT offset1, + unsigned regno2, machine_mode mode2, + HOST_WIDE_INT offset2) +{ + return gen_rtx_PARALLEL ( + mode, gen_rtvec (2, + gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_REG (mode1, regno1), + GEN_INT (offset1)), + gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_REG (mode2, regno2), + GEN_INT (offset2)))); +} + +/* Fill INFO with information about a single argument, and return an + RTL pattern to pass or return the argument. CUM is the cumulative + state for earlier arguments. MODE is the mode of this argument and + TYPE is its type (if known). NAMED is true if this is a named + (fixed) argument rather than a variable one. RETURN_P is true if + returning the argument, or false if passing the argument. */ + +static rtx +loongarch_get_arg_info (struct loongarch_arg_info *info, + const CUMULATIVE_ARGS *cum, machine_mode mode, + const_tree type, bool named, bool return_p) +{ + unsigned num_bytes, num_words; + unsigned fpr_base = return_p ? FP_RETURN : FP_ARG_FIRST; + unsigned gpr_base = return_p ? GP_RETURN : GP_ARG_FIRST; + unsigned alignment = loongarch_function_arg_boundary (mode, type); + + memset (info, 0, sizeof (*info)); + info->gpr_offset = cum->num_gprs; + info->fpr_offset = cum->num_fprs; + + if (named) + { + loongarch_aggregate_field fields[2]; + unsigned fregno = fpr_base + info->fpr_offset; + unsigned gregno = gpr_base + info->gpr_offset; + + /* Pass one- or two-element floating-point aggregates in FPRs. */ + if ((info->num_fprs + = loongarch_pass_aggregate_num_fpr (type, fields)) + && info->fpr_offset + info->num_fprs <= MAX_ARGS_IN_REGISTERS) + switch (info->num_fprs) + { + case 1: + return loongarch_pass_fpr_single (mode, fregno, + TYPE_MODE (fields[0].type), + fields[0].offset); + + case 2: + return loongarch_pass_fpr_pair (mode, fregno, + TYPE_MODE (fields[0].type), + fields[0].offset, + fregno + 1, + TYPE_MODE (fields[1].type), + fields[1].offset); + + default: + gcc_unreachable (); + } + + /* Pass real and complex floating-point numbers in FPRs. */ + if ((info->num_fprs = loongarch_pass_mode_in_fpr_p (mode)) + && info->fpr_offset + info->num_fprs <= MAX_ARGS_IN_REGISTERS) + switch (GET_MODE_CLASS (mode)) + { + case MODE_FLOAT: + return gen_rtx_REG (mode, fregno); + + case MODE_COMPLEX_FLOAT: + return loongarch_pass_fpr_pair (mode, fregno, + GET_MODE_INNER (mode), 0, + fregno + 1, GET_MODE_INNER (mode), + GET_MODE_UNIT_SIZE (mode)); + + default: + gcc_unreachable (); + } + + /* Pass structs with one float and one integer in an FPR and a GPR. */ + if (loongarch_pass_aggregate_in_fpr_and_gpr_p (type, fields) + && info->gpr_offset < MAX_ARGS_IN_REGISTERS + && info->fpr_offset < MAX_ARGS_IN_REGISTERS) + { + info->num_gprs = 1; + info->num_fprs = 1; + + if (!SCALAR_FLOAT_TYPE_P (fields[0].type)) + std::swap (fregno, gregno); + + return loongarch_pass_fpr_pair (mode, fregno, + TYPE_MODE (fields[0].type), + fields[0].offset, gregno, + TYPE_MODE (fields[1].type), + fields[1].offset); + } + } + + /* Work out the size of the argument. */ + num_bytes = type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode); + num_words = (num_bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + + /* Doubleword-aligned varargs start on an even register boundary. 
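     For example (hypothetical argument list), an unnamed __int128 argument
     that arrives when gpr_offset is 3 is bumped to offset 4 by the statement
     below, so it occupies an aligned pair of GPRs. 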
*/ + if (!named && num_bytes != 0 && alignment > BITS_PER_WORD) + info->gpr_offset += info->gpr_offset & 1; + + /* Partition the argument between registers and stack. */ + info->num_fprs = 0; + info->num_gprs = MIN (num_words, MAX_ARGS_IN_REGISTERS - info->gpr_offset); + info->stack_p = (num_words - info->num_gprs) != 0; + + if (info->num_gprs || return_p) + return gen_rtx_REG (mode, gpr_base + info->gpr_offset); + + return NULL_RTX; +} + +/* Implement TARGET_FUNCTION_ARG. */ + +static rtx +loongarch_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) +{ + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + struct loongarch_arg_info info; + + if (arg.end_marker_p ()) + return NULL; + + return loongarch_get_arg_info (&info, cum, arg.mode, arg.type, arg.named, + false); +} + +/* Implement TARGET_FUNCTION_ARG_ADVANCE. */ + +static void +loongarch_function_arg_advance (cumulative_args_t cum_v, + const function_arg_info &arg) +{ + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + struct loongarch_arg_info info; + + loongarch_get_arg_info (&info, cum, arg.mode, arg.type, arg.named, false); + + /* Advance the register count. This has the effect of setting + num_gprs to MAX_ARGS_IN_REGISTERS if a doubleword-aligned + argument required us to skip the final GPR and pass the whole + argument on the stack. */ + cum->num_fprs = info.fpr_offset + info.num_fprs; + cum->num_gprs = info.gpr_offset + info.num_gprs; +} + +/* Implement TARGET_ARG_PARTIAL_BYTES. */ + +static int +loongarch_arg_partial_bytes (cumulative_args_t cum, + const function_arg_info &generic_arg) +{ + struct loongarch_arg_info arg; + + loongarch_get_arg_info (&arg, get_cumulative_args (cum), generic_arg.mode, + generic_arg.type, generic_arg.named, false); + return arg.stack_p ? arg.num_gprs * UNITS_PER_WORD : 0; +} + +/* Implement FUNCTION_VALUE and LIBCALL_VALUE. For normal calls, + VALTYPE is the return type and MODE is VOIDmode. For libcalls, + VALTYPE is null and MODE is the mode of the return value. */ + +static rtx +loongarch_function_value_1 (const_tree type, const_tree func, + machine_mode mode) +{ + struct loongarch_arg_info info; + CUMULATIVE_ARGS args; + + if (type) + { + int unsigned_p = TYPE_UNSIGNED (type); + + mode = TYPE_MODE (type); + + /* Since TARGET_PROMOTE_FUNCTION_MODE unconditionally promotes, + return values, promote the mode here too. */ + mode = promote_function_mode (type, mode, &unsigned_p, func, 1); + } + + memset (&args, 0, sizeof (args)); + return loongarch_get_arg_info (&info, &args, mode, type, true, true); +} + + +/* Implement TARGET_FUNCTION_VALUE. */ + +static rtx +loongarch_function_value (const_tree valtype, const_tree fn_decl_or_type, + bool outgoing ATTRIBUTE_UNUSED) +{ + return loongarch_function_value_1 (valtype, fn_decl_or_type, VOIDmode); +} + +/* Implement TARGET_LIBCALL_VALUE. */ + +static rtx +loongarch_libcall_value (machine_mode mode, const_rtx fun ATTRIBUTE_UNUSED) +{ + return loongarch_function_value_1 (NULL_TREE, NULL_TREE, mode); +} + + +/* Implement TARGET_PASS_BY_REFERENCE. */ + +static bool +loongarch_pass_by_reference (cumulative_args_t cum_v, + const function_arg_info &arg) +{ + HOST_WIDE_INT size = arg.type_size_in_bytes (); + struct loongarch_arg_info info; + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + + /* ??? std_gimplify_va_arg_expr passes NULL for cum. Fortunately, we + never pass variadic arguments in floating-point registers, so we can + avoid the call to loongarch_get_arg_info in this case. 
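     In that case only the size test at the end of this function applies:
     anything larger than 2 * UNITS_PER_WORD, i.e. 16 bytes with 64-bit GPRs,
     is passed by reference (illustrative restatement of the return statement
     below). 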
*/ + if (cum != NULL) + { + /* Don't pass by reference if we can use a floating-point register. */ + loongarch_get_arg_info (&info, cum, arg.mode, arg.type, arg.named, + false); + if (info.num_fprs) + return false; + } + + /* Pass by reference if the data do not fit in two integer registers. */ + return !IN_RANGE (size, 0, 2 * UNITS_PER_WORD); +} + +/* Implement TARGET_RETURN_IN_MEMORY. */ + +static bool +loongarch_return_in_memory (const_tree type, + const_tree fndecl ATTRIBUTE_UNUSED) +{ + CUMULATIVE_ARGS args; + cumulative_args_t cum = pack_cumulative_args (&args); + + /* The rules for returning in memory are the same as for passing the + first named argument by reference. */ + memset (&args, 0, sizeof (args)); + function_arg_info arg (const_cast<tree> (type), /*named=*/true); + return loongarch_pass_by_reference (cum, arg); +} + +/* Implement TARGET_SETUP_INCOMING_VARARGS. */ + +static void +loongarch_setup_incoming_varargs (cumulative_args_t cum, + const function_arg_info &arg, + int *pretend_size ATTRIBUTE_UNUSED, + int no_rtl) +{ + CUMULATIVE_ARGS local_cum; + int gp_saved; + + /* The caller has advanced CUM up to, but not beyond, the last named + argument. Advance a local copy of CUM past the last "real" named + argument, to find out how many registers are left over. */ + local_cum = *get_cumulative_args (cum); + loongarch_function_arg_advance (pack_cumulative_args (&local_cum), arg); + + /* Find out how many registers we need to save. */ + gp_saved = MAX_ARGS_IN_REGISTERS - local_cum.num_gprs; + + if (!no_rtl && gp_saved > 0) + { + rtx ptr = plus_constant (Pmode, virtual_incoming_args_rtx, + REG_PARM_STACK_SPACE (cfun->decl) + - gp_saved * UNITS_PER_WORD); + rtx mem = gen_frame_mem (BLKmode, ptr); + set_mem_alias_set (mem, get_varargs_alias_set ()); + + move_block_from_reg (local_cum.num_gprs + GP_ARG_FIRST, mem, gp_saved); + } + if (REG_PARM_STACK_SPACE (cfun->decl) == 0) + cfun->machine->varargs_size = gp_saved * UNITS_PER_WORD; +} + +/* Make the last instruction frame-related and note that it performs + the operation described by FRAME_PATTERN. */ + +static void +loongarch_set_frame_expr (rtx frame_pattern) +{ + rtx insn; + + insn = get_last_insn (); + RTX_FRAME_RELATED_P (insn) = 1; + REG_NOTES (insn) = alloc_EXPR_LIST (REG_FRAME_RELATED_EXPR, frame_pattern, + REG_NOTES (insn)); +} + +/* Return a frame-related rtx that stores REG at MEM. + REG must be a single register. */ + +static rtx +loongarch_frame_set (rtx mem, rtx reg) +{ + rtx set = gen_rtx_SET (mem, reg); + RTX_FRAME_RELATED_P (set) = 1; + return set; +} + +/* Return true if the current function must save register REGNO. */ + +static bool +loongarch_save_reg_p (unsigned int regno) +{ + bool call_saved = !global_regs[regno] && !call_used_regs[regno]; + bool might_clobber + = crtl->saves_all_registers || df_regs_ever_live_p (regno); + + if (call_saved && might_clobber) + return true; + + if (regno == HARD_FRAME_POINTER_REGNUM && frame_pointer_needed) + return true; + + if (regno == RETURN_ADDR_REGNUM && crtl->calls_eh_return) + return true; + + return false; +} + +/* Determine which GPR save/restore routine to call. */ + +static unsigned +loongarch_save_libcall_count (unsigned mask) +{ + for (unsigned n = GP_REG_LAST; n > GP_REG_FIRST; n--) + if (BITSET_P (mask, n)) + return CALLEE_SAVED_REG_NUMBER (n) + 1; + abort (); +} + +/* Populate the current function's loongarch_frame_info structure. + + LoongArch stack frames grow downward. High addresses are at the top. 
+ + +-------------------------------+ + | | + | incoming stack arguments | + | | + +-------------------------------+ <-- incoming stack pointer + | | + | callee-allocated save area | + | for arguments that are | + | split between registers and | + | the stack | + | | + +-------------------------------+ <-- arg_pointer_rtx (virtual) + | | + | callee-allocated save area | + | for register varargs | + | | + +-------------------------------+ <-- hard_frame_pointer_rtx; + | | stack_pointer_rtx + gp_sp_offset + | GPR save area | + UNITS_PER_WORD + | | + +-------------------------------+ <-- stack_pointer_rtx + fp_sp_offset + | | + UNITS_PER_HWVALUE + | FPR save area | + | | + +-------------------------------+ <-- frame_pointer_rtx (virtual) + | | + | local variables | + | | + P +-------------------------------+ + | | + | outgoing stack arguments | + | | + +-------------------------------+ <-- stack_pointer_rtx + + Dynamic stack allocations such as alloca insert data at point P. + They decrease stack_pointer_rtx but leave frame_pointer_rtx and + hard_frame_pointer_rtx unchanged. */ + +static void +loongarch_compute_frame_info (void) +{ + struct loongarch_frame_info *frame; + HOST_WIDE_INT offset; + unsigned int regno, i, num_x_saved = 0, num_f_saved = 0; + + frame = &cfun->machine->frame; + memset (frame, 0, sizeof (*frame)); + + /* Find out which GPRs we need to save. */ + for (regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) + if (loongarch_save_reg_p (regno)) + frame->mask |= 1 << (regno - GP_REG_FIRST), num_x_saved++; + + /* If this function calls eh_return, we must also save and restore the + EH data registers. */ + if (crtl->calls_eh_return) + for (i = 0; (regno = EH_RETURN_DATA_REGNO (i)) != INVALID_REGNUM; i++) + frame->mask |= 1 << (regno - GP_REG_FIRST), num_x_saved++; + + /* Find out which FPRs we need to save. This loop must iterate over + the same space as its companion in loongarch_for_each_saved_reg. */ + if (TARGET_HARD_FLOAT) + for (regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) + if (loongarch_save_reg_p (regno)) + frame->fmask |= 1 << (regno - FP_REG_FIRST), num_f_saved++; + + /* At the bottom of the frame are any outgoing stack arguments. */ + offset = LARCH_STACK_ALIGN (crtl->outgoing_args_size); + /* Next are local stack variables. */ + offset += LARCH_STACK_ALIGN (get_frame_size ()); + /* The virtual frame pointer points above the local variables. */ + frame->frame_pointer_offset = offset; + /* Next are the callee-saved FPRs. */ + if (frame->fmask) + offset += LARCH_STACK_ALIGN (num_f_saved * UNITS_PER_FP_REG); + frame->fp_sp_offset = offset - UNITS_PER_FP_REG; + /* Next are the callee-saved GPRs. */ + if (frame->mask) + { + unsigned x_save_size = LARCH_STACK_ALIGN (num_x_saved * UNITS_PER_WORD); + unsigned num_save_restore + = 1 + loongarch_save_libcall_count (frame->mask); + + /* Only use save/restore routines if they don't alter the stack size. */ + if (LARCH_STACK_ALIGN (num_save_restore * UNITS_PER_WORD) == x_save_size) + frame->save_libcall_adjustment = x_save_size; + + offset += x_save_size; + } + frame->gp_sp_offset = offset - UNITS_PER_WORD; + /* The hard frame pointer points above the callee-saved GPRs. */ + frame->hard_frame_pointer_offset = offset; + /* Above the hard frame pointer is the callee-allocated varags save area. */ + offset += LARCH_STACK_ALIGN (cfun->machine->varargs_size); + /* Next is the callee-allocated area for pretend stack arguments. 
*/ + offset += LARCH_STACK_ALIGN (crtl->args.pretend_args_size); + /* Arg pointer must be below pretend args, but must be above alignment + padding. */ + frame->arg_pointer_offset = offset - crtl->args.pretend_args_size; + frame->total_size = offset; + /* Next points the incoming stack pointer and any incoming arguments. */ + + /* Only use save/restore routines when the GPRs are atop the frame. */ + if (frame->hard_frame_pointer_offset != frame->total_size) + frame->save_libcall_adjustment = 0; +} + +/* Implement INITIAL_ELIMINATION_OFFSET. FROM is either the frame pointer + or argument pointer. TO is either the stack pointer or hard frame + pointer. */ + +HOST_WIDE_INT +loongarch_initial_elimination_offset (int from, int to) +{ + HOST_WIDE_INT src, dest; + + loongarch_compute_frame_info (); + + if (to == HARD_FRAME_POINTER_REGNUM) + dest = cfun->machine->frame.hard_frame_pointer_offset; + else if (to == STACK_POINTER_REGNUM) + dest = 0; /* The stack pointer is the base of all offsets, hence 0. */ + else + gcc_unreachable (); + + if (from == FRAME_POINTER_REGNUM) + src = cfun->machine->frame.frame_pointer_offset; + else if (from == ARG_POINTER_REGNUM) + src = cfun->machine->frame.arg_pointer_offset; + else + gcc_unreachable (); + + return src - dest; +} + +/* A function to save or store a register. The first argument is the + register and the second is the stack slot. */ +typedef void (*loongarch_save_restore_fn) (rtx, rtx); + +/* Use FN to save or restore register REGNO. MODE is the register's + mode and OFFSET is the offset of its save slot from the current + stack pointer. */ + +static void +loongarch_save_restore_reg (machine_mode mode, int regno, HOST_WIDE_INT offset, + loongarch_save_restore_fn fn) +{ + rtx mem; + + mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx, offset)); + fn (gen_rtx_REG (mode, regno), mem); +} + +/* Call FN for each register that is saved by the current function. + SP_OFFSET is the offset of the current stack pointer from the start + of the frame. */ + +static void +loongarch_for_each_saved_reg (HOST_WIDE_INT sp_offset, + loongarch_save_restore_fn fn) +{ + HOST_WIDE_INT offset; + + /* Save the link register and s-registers. */ + offset = cfun->machine->frame.gp_sp_offset - sp_offset; + for (int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) + if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST)) + { + loongarch_save_restore_reg (word_mode, regno, offset, fn); + offset -= UNITS_PER_WORD; + } + + /* This loop must iterate over the same space as its companion in + loongarch_compute_frame_info. */ + offset = cfun->machine->frame.fp_sp_offset - sp_offset; + for (int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) + if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST)) + { + machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode; + + loongarch_save_restore_reg (mode, regno, offset, fn); + offset -= GET_MODE_SIZE (mode); + } +} + +/* Emit a move from SRC to DEST. Assume that the move expanders can + handle all moves if !can_create_pseudo_p (). The distinction is + important because, unlike emit_move_insn, the move expanders know + how to force Pmode objects into the constant pool even when the + constant pool address is not itself legitimate. */ + +rtx +loongarch_emit_move (rtx dest, rtx src) +{ + return (can_create_pseudo_p () ? emit_move_insn (dest, src) + : emit_move_insn_1 (dest, src)); +} + +/* Save register REG to MEM. Make the instruction frame-related. 
*/ + +static void +loongarch_save_reg (rtx reg, rtx mem) +{ + loongarch_emit_move (mem, reg); + loongarch_set_frame_expr (loongarch_frame_set (mem, reg)); +} + +/* Restore register REG from MEM. */ + +static void +loongarch_restore_reg (rtx reg, rtx mem) +{ + rtx insn = loongarch_emit_move (reg, mem); + rtx dwarf = NULL_RTX; + dwarf = alloc_reg_note (REG_CFA_RESTORE, reg, dwarf); + REG_NOTES (insn) = dwarf; + + RTX_FRAME_RELATED_P (insn) = 1; +} + +/* For stack frames that can't be allocated with a single ADDI instruction, + compute the best value to initially allocate. It must at a minimum + allocate enough space to spill the callee-saved registers. */ + +static HOST_WIDE_INT +loongarch_first_stack_step (struct loongarch_frame_info *frame) +{ + if (IMM12_OPERAND (frame->total_size)) + return frame->total_size; + + HOST_WIDE_INT min_first_step + = LARCH_STACK_ALIGN (frame->total_size - frame->fp_sp_offset); + HOST_WIDE_INT max_first_step = IMM_REACH / 2 - PREFERRED_STACK_BOUNDARY / 8; + HOST_WIDE_INT min_second_step = frame->total_size - max_first_step; + gcc_assert (min_first_step <= max_first_step); + + /* As an optimization, use the least-significant bits of the total frame + size, so that the second adjustment step is just LU12I + ADD. */ + if (!IMM12_OPERAND (min_second_step) + && frame->total_size % IMM_REACH < IMM_REACH / 2 + && frame->total_size % IMM_REACH >= min_first_step) + return frame->total_size % IMM_REACH; + + return max_first_step; +} + +static void +loongarch_emit_stack_tie (void) +{ + emit_insn (gen_stack_tie (Pmode, stack_pointer_rtx, hard_frame_pointer_rtx)); +} + +#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP) + +#if PROBE_INTERVAL > 16384 +#error Cannot use indexed addressing mode for stack probing +#endif + +/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE, + inclusive. These are offsets from the current stack pointer. */ + +static void +loongarch_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size) +{ + /* See if we have a constant small number of probes to generate. If so, + that's the easy case. */ + if ((TARGET_64BIT && (first + size <= 32768)) + || (!TARGET_64BIT && (first + size <= 2048))) + { + HOST_WIDE_INT i; + + /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until + it exceeds SIZE. If only one probe is needed, this will not + generate any code. Then probe at FIRST + SIZE. */ + for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL) + emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, + -(first + i))); + + emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, + -(first + size))); + } + + /* Otherwise, do the same as above, but in a loop. Note that we must be + extra careful with variables wrapping around because we might be at + the very top (or the very bottom) of the address space and we have + to be able to handle this case properly; in particular, we use an + equality test for the loop condition. */ + else + { + HOST_WIDE_INT rounded_size; + rtx r13 = LARCH_PROLOGUE_TEMP (Pmode); + rtx r12 = LARCH_PROLOGUE_TEMP2 (Pmode); + rtx r14 = LARCH_PROLOGUE_TEMP3 (Pmode); + + /* Sanity check for the addressing mode we're going to use. */ + gcc_assert (first <= 16384); + + + /* Step 1: round SIZE to the previous multiple of the interval. 
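+	 For example (figures are illustrative; the interval is
+	 1 << STACK_CHECK_PROBE_INTERVAL_EXP, typically 4 KiB): with a
+	 4096-byte interval and SIZE == 100000, ROUNDED_SIZE is 98304, so
+	 the loop below probes every 4096 bytes down to FIRST + 98304 and
+	 the remaining 1696 bytes are covered by the final probe at
+	 FIRST + SIZE.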
*/ + + rounded_size = ROUND_DOWN (size, PROBE_INTERVAL); + + /* TEST_ADDR = SP + FIRST */ + if (first != 0) + { + emit_move_insn (r14, GEN_INT (first)); + emit_insn (gen_rtx_SET (r13, gen_rtx_MINUS (Pmode, + stack_pointer_rtx, + r14))); + } + else + emit_move_insn (r13, stack_pointer_rtx); + + /* Step 2: compute initial and final value of the loop counter. */ + + emit_move_insn (r14, GEN_INT (PROBE_INTERVAL)); + /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */ + if (rounded_size == 0) + emit_move_insn (r12, r13); + else + { + emit_move_insn (r12, GEN_INT (rounded_size)); + emit_insn (gen_rtx_SET (r12, gen_rtx_MINUS (Pmode, r13, r12))); + /* Step 3: the loop + + do + { + TEST_ADDR = TEST_ADDR + PROBE_INTERVAL + probe at TEST_ADDR + } + while (TEST_ADDR != LAST_ADDR) + + probes at FIRST + N * PROBE_INTERVAL for values of N from 1 + until it is equal to ROUNDED_SIZE. */ + + emit_insn (gen_probe_stack_range (Pmode, r13, r13, r12, r14)); + } + + /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time + that SIZE is equal to ROUNDED_SIZE. */ + + if (size != rounded_size) + { + if (TARGET_64BIT) + emit_stack_probe (plus_constant (Pmode, r12, rounded_size - size)); + else + { + HOST_WIDE_INT i; + for (i = 2048; i < (size - rounded_size); i += 2048) + { + emit_stack_probe (plus_constant (Pmode, r12, -i)); + emit_insn (gen_rtx_SET (r12, + plus_constant (Pmode, r12, -2048))); + } + rtx r1 = plus_constant (Pmode, r12, + -(size - rounded_size - i + 2048)); + emit_stack_probe (r1); + } + } + } + + /* Make sure nothing is scheduled before we are done. */ + emit_insn (gen_blockage ()); +} + +/* Probe a range of stack addresses from REG1 to REG2 inclusive. These are + absolute addresses. */ +const char * +loongarch_output_probe_stack_range (rtx reg1, rtx reg2, rtx reg3) +{ + static int labelno = 0; + char loop_lab[32], tmp[64]; + rtx xops[3]; + + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); + + /* Loop. */ + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + + /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ + xops[0] = reg1; + xops[1] = GEN_INT (-PROBE_INTERVAL); + xops[2] = reg3; + if (TARGET_64BIT) + output_asm_insn ("sub.d\t%0,%0,%2", xops); + else + output_asm_insn ("sub.w\t%0,%0,%2", xops); + + /* Probe at TEST_ADDR, test if TEST_ADDR == LAST_ADDR and branch. */ + xops[1] = reg2; + strcpy (tmp, "bne\t%0,%1,"); + if (TARGET_64BIT) + output_asm_insn ("st.d\t$r0,%0,0", xops); + else + output_asm_insn ("st.w\t$r0,%0,0", xops); + output_asm_insn (strcat (tmp, &loop_lab[1]), xops); + + return ""; +} + +/* Expand the "prologue" pattern. */ + +void +loongarch_expand_prologue (void) +{ + struct loongarch_frame_info *frame = &cfun->machine->frame; + HOST_WIDE_INT size = frame->total_size; + HOST_WIDE_INT tmp; + rtx insn; + + if (flag_stack_usage_info) + current_function_static_stack_size = size; + + if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || flag_stack_clash_protection) + { + if (crtl->is_leaf && !cfun->calls_alloca) + { + if (size > PROBE_INTERVAL && size > get_stack_check_protect ()) + { + tmp = size - get_stack_check_protect (); + loongarch_emit_probe_stack_range (get_stack_check_protect (), + tmp); + } + } + else if (size > 0) + loongarch_emit_probe_stack_range (get_stack_check_protect (), size); + } + + /* Save the registers. 
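+     STEP1 is the part of the frame allocated before the register saves:
+     if the whole frame fits in a signed 12-bit immediate it is allocated
+     in one go, otherwise loongarch_first_stack_step picks a value large
+     enough that every save slot stays within a 12-bit offset of the new
+     stack pointer.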
*/ + if ((frame->mask | frame->fmask) != 0) + { + HOST_WIDE_INT step1 = MIN (size, loongarch_first_stack_step (frame)); + + insn = gen_add3_insn (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-step1)); + RTX_FRAME_RELATED_P (emit_insn (insn)) = 1; + size -= step1; + loongarch_for_each_saved_reg (size, loongarch_save_reg); + } + + + /* Set up the frame pointer, if we're using one. */ + if (frame_pointer_needed) + { + insn = gen_add3_insn (hard_frame_pointer_rtx, stack_pointer_rtx, + GEN_INT (frame->hard_frame_pointer_offset - size)); + RTX_FRAME_RELATED_P (emit_insn (insn)) = 1; + + loongarch_emit_stack_tie (); + } + + /* Allocate the rest of the frame. */ + if (size > 0) + { + if (IMM12_OPERAND (-size)) + { + insn = gen_add3_insn (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-size)); + RTX_FRAME_RELATED_P (emit_insn (insn)) = 1; + } + else + { + loongarch_emit_move (LARCH_PROLOGUE_TEMP (Pmode), GEN_INT (-size)); + emit_insn (gen_add3_insn (stack_pointer_rtx, stack_pointer_rtx, + LARCH_PROLOGUE_TEMP (Pmode))); + + /* Describe the effect of the previous instructions. */ + insn = plus_constant (Pmode, stack_pointer_rtx, -size); + insn = gen_rtx_SET (stack_pointer_rtx, insn); + loongarch_set_frame_expr (insn); + } + } +} + +/* Return nonzero if this function is known to have a null epilogue. + This allows the optimizer to omit jumps to jumps if no stack + was created. */ + +bool +loongarch_can_use_return_insn (void) +{ + return reload_completed && cfun->machine->frame.total_size == 0; +} + +/* Expand an "epilogue" or "sibcall_epilogue" pattern; SIBCALL_P + says which. */ + +void +loongarch_expand_epilogue (bool sibcall_p) +{ + /* Split the frame into two. STEP1 is the amount of stack we should + deallocate before restoring the registers. STEP2 is the amount we + should deallocate afterwards. + + Start off by assuming that no registers need to be restored. */ + struct loongarch_frame_info *frame = &cfun->machine->frame; + HOST_WIDE_INT step1 = frame->total_size; + HOST_WIDE_INT step2 = 0; + rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM); + rtx insn; + + /* We need to add memory barrier to prevent read from deallocated stack. */ + bool need_barrier_p + = (get_frame_size () + cfun->machine->frame.arg_pointer_offset) != 0; + + if (!sibcall_p && loongarch_can_use_return_insn ()) + { + emit_jump_insn (gen_return ()); + return; + } + + /* Move past any dynamic stack allocations. */ + if (cfun->calls_alloca) + { + /* Emit a barrier to prevent loads from a deallocated stack. */ + loongarch_emit_stack_tie (); + need_barrier_p = false; + + rtx adjust = GEN_INT (-frame->hard_frame_pointer_offset); + if (!IMM12_OPERAND (INTVAL (adjust))) + { + loongarch_emit_move (LARCH_PROLOGUE_TEMP (Pmode), adjust); + adjust = LARCH_PROLOGUE_TEMP (Pmode); + } + + insn = emit_insn (gen_add3_insn (stack_pointer_rtx, + hard_frame_pointer_rtx, + adjust)); + + rtx dwarf = NULL_RTX; + rtx minus_offset = GEN_INT (-frame->hard_frame_pointer_offset); + rtx cfa_adjust_value = gen_rtx_PLUS (Pmode, + hard_frame_pointer_rtx, + minus_offset); + + rtx cfa_adjust_rtx = gen_rtx_SET (stack_pointer_rtx, cfa_adjust_value); + dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, cfa_adjust_rtx, dwarf); + RTX_FRAME_RELATED_P (insn) = 1; + + REG_NOTES (insn) = dwarf; + } + + /* If we need to restore registers, deallocate as much stack as + possible in the second step without going out of range. */ + if ((frame->mask | frame->fmask) != 0) + { + step2 = loongarch_first_stack_step (frame); + step1 -= step2; + } + + /* Set TARGET to BASE + STEP1. 
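+     In other words, deallocate the first STEP1 bytes by adding STEP1 to
+     the stack pointer; the remaining STEP2 bytes are released after the
+     registers have been restored.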
*/ + if (step1 > 0) + { + /* Emit a barrier to prevent loads from a deallocated stack. */ + loongarch_emit_stack_tie (); + need_barrier_p = false; + + /* Get an rtx for STEP1 that we can add to BASE. */ + rtx adjust = GEN_INT (step1); + if (!IMM12_OPERAND (step1)) + { + loongarch_emit_move (LARCH_PROLOGUE_TEMP (Pmode), adjust); + adjust = LARCH_PROLOGUE_TEMP (Pmode); + } + + insn = emit_insn (gen_add3_insn (stack_pointer_rtx, + stack_pointer_rtx, + adjust)); + + rtx dwarf = NULL_RTX; + rtx cfa_adjust_rtx = gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (step2)); + + dwarf = alloc_reg_note (REG_CFA_DEF_CFA, cfa_adjust_rtx, dwarf); + RTX_FRAME_RELATED_P (insn) = 1; + + REG_NOTES (insn) = dwarf; + } + + /* Restore the registers. */ + loongarch_for_each_saved_reg (frame->total_size - step2, + loongarch_restore_reg); + + if (need_barrier_p) + loongarch_emit_stack_tie (); + + /* Deallocate the final bit of the frame. */ + if (step2 > 0) + { + insn = emit_insn (gen_add3_insn (stack_pointer_rtx, + stack_pointer_rtx, + GEN_INT (step2))); + + rtx dwarf = NULL_RTX; + rtx cfa_adjust_rtx = gen_rtx_PLUS (Pmode, stack_pointer_rtx, const0_rtx); + dwarf = alloc_reg_note (REG_CFA_DEF_CFA, cfa_adjust_rtx, dwarf); + RTX_FRAME_RELATED_P (insn) = 1; + + REG_NOTES (insn) = dwarf; + } + + /* Add in the __builtin_eh_return stack adjustment. */ + if (crtl->calls_eh_return) + emit_insn (gen_add3_insn (stack_pointer_rtx, stack_pointer_rtx, + EH_RETURN_STACKADJ_RTX)); + + if (!sibcall_p) + emit_jump_insn (gen_simple_return_internal (ra)); +} + +#define LU32I_B (0xfffffULL << 32) +#define LU52I_B (0xfffULL << 52) + +/* Fill CODES with a sequence of rtl operations to load VALUE. + Return the number of operations needed. */ + +static unsigned int +loongarch_build_integer (struct loongarch_integer_op *codes, + HOST_WIDE_INT value) + +{ + unsigned int cost = 0; + + /* Get the lower 32 bits of the value. */ + HOST_WIDE_INT low_part = TARGET_64BIT ? value << 32 >> 32 : value; + + if (IMM12_OPERAND (low_part) || IMM12_OPERAND_UNSIGNED (low_part)) + { + /* The value of the lower 32 bit be loaded with one instruction. + lu12i.w. */ + codes[0].code = UNKNOWN; + codes[0].method = METHOD_NORMAL; + codes[0].value = low_part; + cost++; + } + else + { + /* lu12i.w + ior. */ + codes[0].code = UNKNOWN; + codes[0].method = METHOD_NORMAL; + codes[0].value = low_part & ~(IMM_REACH - 1); + cost++; + HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1); + if (iorv != 0) + { + codes[1].code = IOR; + codes[1].method = METHOD_NORMAL; + codes[1].value = iorv; + cost++; + } + } + + if (TARGET_64BIT) + { + bool lu32i[2] = {(value & LU32I_B) == 0, (value & LU32I_B) == LU32I_B}; + bool lu52i[2] = {(value & LU52I_B) == 0, (value & LU52I_B) == LU52I_B}; + + int sign31 = (value & (1UL << 31)) >> 31; + /* Determine whether the upper 32 bits are sign-extended from the lower + 32 bits. If it is, the instructions to load the high order can be + ommitted. */ + if (lu32i[sign31] && lu52i[sign31]) + return cost; + /* Determine whether bits 32-51 are sign-extended from the lower 32 + bits. If so, directly load 52-63 bits. */ + else if (lu32i[sign31]) + { + codes[cost].method = METHOD_LU52I; + codes[cost].value = (value >> 52) << 52; + return cost + 1; + } + + codes[cost].method = METHOD_LU32I; + codes[cost].value = ((value << 12) >> 44) << 32; + cost++; + + /* Determine whether the 52-61 bits are sign-extended from the low order, + and if not, load the 52-61 bits. 
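+     As a worked example (value chosen for this note only): loading
+     0x123456789 uses lu12i.w for bits 31..12 (0x23456), ori for bits
+     11..0 (0x789) and lu32i.d for bits 51..32 (0x1); lu52i.d is not
+     needed because bits 63..52 already match the sign of bit 51.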
*/ + if (!lu52i[(value & (1ULL << 51)) >> 51]) + { + codes[cost].method = METHOD_LU52I; + codes[cost].value = (value >> 52) << 52; + cost++; + } + } + + gcc_assert (cost <= LARCH_MAX_INTEGER_OPS); + + return cost; +} + +/* Fill CODES with a sequence of rtl operations to load VALUE. + Return the number of operations needed. + Split interger in loongarch_output_move. */ + +static unsigned int +loongarch_integer_cost (HOST_WIDE_INT value) +{ + struct loongarch_integer_op codes[LARCH_MAX_INTEGER_OPS]; + return loongarch_build_integer (codes, value); +} + +/* Implement TARGET_LEGITIMATE_CONSTANT_P. */ + +static bool +loongarch_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x) +{ + return loongarch_const_insns (x) > 0; +} + +/* Return true if X is a thread-local symbol. */ + +static bool +loongarch_tls_symbol_p (rtx x) +{ + return SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0; +} + +/* Return true if SYMBOL_REF X is associated with a global symbol + (in the STB_GLOBAL sense). */ + +bool +loongarch_global_symbol_p (const_rtx x) +{ + if (LABEL_REF_P (x)) + return false; + + const_tree decl = SYMBOL_REF_DECL (x); + + if (!decl) + return !SYMBOL_REF_LOCAL_P (x) || SYMBOL_REF_EXTERNAL_P (x); + + /* Weakref symbols are not TREE_PUBLIC, but their targets are global + or weak symbols. Relocations in the object file will be against + the target symbol, so it's that symbol's binding that matters here. */ + return DECL_P (decl) && (TREE_PUBLIC (decl) || DECL_WEAK (decl)); +} + +bool +loongarch_global_symbol_noweak_p (const_rtx x) +{ + if (LABEL_REF_P (x)) + return false; + + const_tree decl = SYMBOL_REF_DECL (x); + + if (!decl) + return !SYMBOL_REF_LOCAL_P (x) || SYMBOL_REF_EXTERNAL_P (x); + + return DECL_P (decl) && TREE_PUBLIC (decl); +} + +bool +loongarch_weak_symbol_p (const_rtx x) +{ + const_tree decl; + if (LABEL_REF_P (x) || !(decl = SYMBOL_REF_DECL (x))) + return false; + return DECL_P (decl) && DECL_WEAK (decl); +} + +/* Return true if SYMBOL_REF X binds locally. */ + +bool +loongarch_symbol_binds_local_p (const_rtx x) +{ + if (LABEL_REF_P (x)) + return false; + + return (SYMBOL_REF_DECL (x) ? targetm.binds_local_p (SYMBOL_REF_DECL (x)) + : SYMBOL_REF_LOCAL_P (x)); +} + +/* Return true if rtx constants of mode MODE should be put into a small + data section. */ + +static bool +loongarch_rtx_constant_in_small_data_p (machine_mode mode) +{ + return (GET_MODE_SIZE (mode) <= g_switch_value); +} + +/* Return the method that should be used to access SYMBOL_REF or + LABEL_REF X. */ + +static enum loongarch_symbol_type +loongarch_classify_symbol (const_rtx x) +{ + if (LABEL_REF_P (x)) + return SYMBOL_GOT_DISP; + + gcc_assert (SYMBOL_REF_P (x)); + + if (SYMBOL_REF_TLS_MODEL (x)) + return SYMBOL_TLS; + + if (SYMBOL_REF_P (x)) + return SYMBOL_GOT_DISP; + + return SYMBOL_GOT_DISP; +} + +/* Return true if X is a symbolic constant. If it is, + store the type of the symbol in *SYMBOL_TYPE. */ + +bool +loongarch_symbolic_constant_p (rtx x, enum loongarch_symbol_type *symbol_type) +{ + rtx offset; + + split_const (x, &x, &offset); + if (UNSPEC_ADDRESS_P (x)) + { + *symbol_type = UNSPEC_ADDRESS_TYPE (x); + x = UNSPEC_ADDRESS (x); + } + else if (SYMBOL_REF_P (x) || LABEL_REF_P (x)) + { + *symbol_type = loongarch_classify_symbol (x); + if (*symbol_type == SYMBOL_TLS) + return true; + } + else + return false; + + if (offset == const0_rtx) + return true; + + /* Check whether a nonzero offset is valid for the underlying + relocations. 
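+     None of the GOT or TLS relocations used here can carry an addend,
+     so a nonzero offset is rejected for all of these symbol types.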
*/ + switch (*symbol_type) + { + case SYMBOL_GOT_DISP: + case SYMBOL_TLSGD: + case SYMBOL_TLSLDM: + case SYMBOL_TLS: + return false; + } + gcc_unreachable (); +} + +/* Returns the number of instructions necessary to reference a symbol. */ + +static int +loongarch_symbol_insns (enum loongarch_symbol_type type, machine_mode mode) +{ + switch (type) + { + case SYMBOL_GOT_DISP: + /* The constant will have to be loaded from the GOT before it + is used in an address. */ + if (mode != MAX_MACHINE_MODE) + return 0; + + return 3; + + case SYMBOL_TLSGD: + case SYMBOL_TLSLDM: + return 1; + + case SYMBOL_TLS: + /* We don't treat a bare TLS symbol as a constant. */ + return 0; + } + gcc_unreachable (); +} + +/* Implement TARGET_CANNOT_FORCE_CONST_MEM. */ + +static bool +loongarch_cannot_force_const_mem (machine_mode mode, rtx x) +{ + enum loongarch_symbol_type type; + rtx base, offset; + + /* As an optimization, reject constants that loongarch_legitimize_move + can expand inline. + + Suppose we have a multi-instruction sequence that loads constant C + into register R. If R does not get allocated a hard register, and + R is used in an operand that allows both registers and memory + references, reload will consider forcing C into memory and using + one of the instruction's memory alternatives. Returning false + here will force it to use an input reload instead. */ + if (CONST_INT_P (x) && loongarch_legitimate_constant_p (mode, x)) + return true; + + split_const (x, &base, &offset); + if (loongarch_symbolic_constant_p (base, &type)) + { + /* The same optimization as for CONST_INT. */ + if (IMM12_INT (offset) + && loongarch_symbol_insns (type, MAX_MACHINE_MODE) > 0) + return true; + } + + /* TLS symbols must be computed by loongarch_legitimize_move. */ + if (tls_referenced_p (x)) + return true; + + return false; +} + +/* Return true if register REGNO is a valid base register for mode MODE. + STRICT_P is true if REG_OK_STRICT is in effect. */ + +int +loongarch_regno_mode_ok_for_base_p (int regno, + machine_mode mode ATTRIBUTE_UNUSED, + bool strict_p) +{ + if (!HARD_REGISTER_NUM_P (regno)) + { + if (!strict_p) + return true; + regno = reg_renumber[regno]; + } + + /* These fake registers will be eliminated to either the stack or + hard frame pointer, both of which are usually valid base registers. + Reload deals with the cases where the eliminated form isn't valid. */ + if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM) + return true; + + return GP_REG_P (regno); +} + +/* Return true if X is a valid base register for mode MODE. + STRICT_P is true if REG_OK_STRICT is in effect. */ + +static bool +loongarch_valid_base_register_p (rtx x, machine_mode mode, bool strict_p) +{ + if (!strict_p && SUBREG_P (x)) + x = SUBREG_REG (x); + + return (REG_P (x) + && loongarch_regno_mode_ok_for_base_p (REGNO (x), mode, strict_p)); +} + +/* Return true if, for every base register BASE_REG, (plus BASE_REG X) + can address a value of mode MODE. */ + +static bool +loongarch_valid_offset_p (rtx x, machine_mode mode) +{ + /* Check that X is a signed 12-bit number, + or check that X is a signed 16-bit number + and offset 4 byte aligned. */ + if (!(const_arith_operand (x, Pmode) + || ((mode == E_SImode || mode == E_DImode) + && const_imm16_operand (x, Pmode) + && (loongarch_signed_immediate_p (INTVAL (x), 14, 2))))) + return false; + + /* We may need to split multiword moves, so make sure that every word + is accessible. 
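+     For example (illustrative), a 16-byte TImode access on LP64 also
+     requires X + 8, the offset of its second word, to fit in a signed
+     12-bit immediate.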
*/ + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD + && !IMM12_OPERAND (INTVAL (x) + GET_MODE_SIZE (mode) - UNITS_PER_WORD)) + return false; + + return true; +} + +static bool +loongarch_valid_index_p (struct loongarch_address_info *info, rtx x, + machine_mode mode, bool strict_p) +{ + rtx index; + + if ((REG_P (x) || SUBREG_P (x)) + && GET_MODE (x) == Pmode) + { + index = x; + } + else + return false; + + if (!strict_p + && SUBREG_P (index) + && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))]) + index = SUBREG_REG (index); + + if (loongarch_valid_base_register_p (index, mode, strict_p)) + { + info->type = ADDRESS_REG_REG; + info->offset = index; + return true; + } + + return false; +} + +/* Return true if X is a valid address for machine mode MODE. If it is, + fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in + effect. */ + +static bool +loongarch_classify_address (struct loongarch_address_info *info, rtx x, + machine_mode mode, bool strict_p) +{ + switch (GET_CODE (x)) + { + case REG: + case SUBREG: + info->type = ADDRESS_REG; + info->reg = x; + info->offset = const0_rtx; + return loongarch_valid_base_register_p (info->reg, mode, strict_p); + + case PLUS: + if (loongarch_valid_base_register_p (XEXP (x, 0), mode, strict_p) + && loongarch_valid_index_p (info, XEXP (x, 1), mode, strict_p)) + { + info->reg = XEXP (x, 0); + return true; + } + + if (loongarch_valid_base_register_p (XEXP (x, 1), mode, strict_p) + && loongarch_valid_index_p (info, XEXP (x, 0), mode, strict_p)) + { + info->reg = XEXP (x, 1); + return true; + } + + info->type = ADDRESS_REG; + info->reg = XEXP (x, 0); + info->offset = XEXP (x, 1); + return (loongarch_valid_base_register_p (info->reg, mode, strict_p) + && loongarch_valid_offset_p (info->offset, mode)); + default: + return false; + } +} + +/* Implement TARGET_LEGITIMATE_ADDRESS_P. */ + +static bool +loongarch_legitimate_address_p (machine_mode mode, rtx x, bool strict_p) +{ + struct loongarch_address_info addr; + + return loongarch_classify_address (&addr, x, mode, strict_p); +} + +/* Return true if ADDR matches the pattern for the indexed address + instruction. */ + +static bool +loongarch_index_address_p (rtx addr, machine_mode mode ATTRIBUTE_UNUSED) +{ + if (GET_CODE (addr) != PLUS + || !REG_P (XEXP (addr, 0)) + || !REG_P (XEXP (addr, 1))) + return false; + return true; +} + +/* Return the number of instructions needed to load or store a value + of mode MODE at address X. Return 0 if X isn't valid for MODE. + Assume that multiword moves may need to be split into word moves + if MIGHT_SPLIT_P, otherwise assume that a single load or store is + enough. */ + +int +loongarch_address_insns (rtx x, machine_mode mode, bool might_split_p) +{ + struct loongarch_address_info addr; + int factor; + + if (!loongarch_classify_address (&addr, x, mode, false)) + return 0; + + /* BLKmode is used for single unaligned loads and stores and should + not count as a multiword mode. (GET_MODE_SIZE (BLKmode) is pretty + meaningless, so we have to single it out as a special case one way + or the other.) 
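+     When MIGHT_SPLIT_P, the cost below is scaled by the number of words
+     moved, e.g. (illustratively) a 16-byte access on LP64 counts twice.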
*/ + if (mode != BLKmode && might_split_p) + factor = (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + else + factor = 1; + + if (loongarch_classify_address (&addr, x, mode, false)) + switch (addr.type) + { + case ADDRESS_REG: + return factor; + + case ADDRESS_REG_REG: + return factor; + + case ADDRESS_CONST_INT: + return factor; + + case ADDRESS_SYMBOLIC: + return factor * loongarch_symbol_insns (addr.symbol_type, mode); + } + return 0; +} + +/* Return true if X fits within an unsigned field of BITS bits that is + shifted left SHIFT bits before being used. */ + +bool +loongarch_unsigned_immediate_p (unsigned HOST_WIDE_INT x, int bits, + int shift = 0) +{ + return (x & ((1 << shift) - 1)) == 0 && x < ((unsigned) 1 << (shift + bits)); +} + +/* Return true if X fits within a signed field of BITS bits that is + shifted left SHIFT bits before being used. */ + +bool +loongarch_signed_immediate_p (unsigned HOST_WIDE_INT x, int bits, + int shift = 0) +{ + x += 1 << (bits + shift - 1); + return loongarch_unsigned_immediate_p (x, bits, shift); +} + +/* Return true if X is a legitimate address with a 12-bit offset. + MODE is the mode of the value being accessed. */ + +bool +loongarch_12bit_offset_address_p (rtx x, machine_mode mode) +{ + struct loongarch_address_info addr; + + return (loongarch_classify_address (&addr, x, mode, false) + && addr.type == ADDRESS_REG + && CONST_INT_P (addr.offset) + && LARCH_U12BIT_OFFSET_P (INTVAL (addr.offset))); +} + +/* Return true if X is a legitimate address with a 14-bit offset shifted 2. + MODE is the mode of the value being accessed. */ + +bool +loongarch_14bit_shifted_offset_address_p (rtx x, machine_mode mode) +{ + struct loongarch_address_info addr; + + return (loongarch_classify_address (&addr, x, mode, false) + && addr.type == ADDRESS_REG + && CONST_INT_P (addr.offset) + && LARCH_16BIT_OFFSET_P (INTVAL (addr.offset)) + && LARCH_SHIFT_2_OFFSET_P (INTVAL (addr.offset))); +} + +bool +loongarch_base_index_address_p (rtx x, machine_mode mode) +{ + struct loongarch_address_info addr; + + return (loongarch_classify_address (&addr, x, mode, false) + && addr.type == ADDRESS_REG_REG + && REG_P (addr.offset)); +} + +/* Return the number of instructions needed to load constant X, + Return 0 if X isn't a valid constant. */ + +int +loongarch_const_insns (rtx x) +{ + enum loongarch_symbol_type symbol_type; + rtx offset; + + switch (GET_CODE (x)) + { + case CONST_INT: + return loongarch_integer_cost (INTVAL (x)); + + case CONST_VECTOR: + /* Fall through. */ + case CONST_DOUBLE: + return x == CONST0_RTX (GET_MODE (x)) ? 1 : 0; + + case CONST: + /* See if we can refer to X directly. */ + if (loongarch_symbolic_constant_p (x, &symbol_type)) + return loongarch_symbol_insns (symbol_type, MAX_MACHINE_MODE); + + /* Otherwise try splitting the constant into a base and offset. + If the offset is a 12-bit value, we can load the base address + into a register and then use ADDI.{W/D} to add in the offset. + If the offset is larger, we can load the base and offset + into separate registers and add them together with ADD.{W/D}. + However, the latter is only possible before reload; during + and after reload, we must have the option of forcing the + constant into the pool instead. 
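+     For example (illustrative), (const (plus (symbol_ref X) (const_int 8)))
+     costs the symbol's own load sequence plus one instruction for the
+     12-bit addition.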
*/ + split_const (x, &x, &offset); + if (offset != 0) + { + int n = loongarch_const_insns (x); + if (n != 0) + { + if (IMM12_INT (offset)) + return n + 1; + else if (!targetm.cannot_force_const_mem (GET_MODE (x), x)) + return n + 1 + loongarch_integer_cost (INTVAL (offset)); + } + } + return 0; + + case SYMBOL_REF: + case LABEL_REF: + return loongarch_symbol_insns ( + loongarch_classify_symbol (x), MAX_MACHINE_MODE); + + default: + return 0; + } +} + +/* X is a doubleword constant that can be handled by splitting it into + two words and loading each word separately. Return the number of + instructions required to do this. */ + +int +loongarch_split_const_insns (rtx x) +{ + unsigned int low, high; + + low = loongarch_const_insns (loongarch_subword (x, false)); + high = loongarch_const_insns (loongarch_subword (x, true)); + gcc_assert (low > 0 && high > 0); + return low + high; +} + +/* Return the number of instructions needed to implement INSN, + given that it loads from or stores to MEM. */ + +int +loongarch_load_store_insns (rtx mem, rtx_insn *insn) +{ + machine_mode mode; + bool might_split_p; + rtx set; + + gcc_assert (MEM_P (mem)); + mode = GET_MODE (mem); + + /* Try to prove that INSN does not need to be split. */ + might_split_p = GET_MODE_SIZE (mode) > UNITS_PER_WORD; + if (might_split_p) + { + set = single_set (insn); + if (set + && !loongarch_split_move_insn_p (SET_DEST (set), SET_SRC (set))) + might_split_p = false; + } + + return loongarch_address_insns (XEXP (mem, 0), mode, might_split_p); +} + +/* Return the number of instructions needed for an integer division. */ + +int +loongarch_idiv_insns (machine_mode mode ATTRIBUTE_UNUSED) +{ + int count; + + count = 1; + if (TARGET_CHECK_ZERO_DIV) + count += 2; + + return count; +} + +/* Emit an instruction of the form (set TARGET (CODE OP0 OP1)). */ + +void +loongarch_emit_binary (enum rtx_code code, rtx target, rtx op0, rtx op1) +{ + emit_insn (gen_rtx_SET (target, gen_rtx_fmt_ee (code, GET_MODE (target), + op0, op1))); +} + +/* Compute (CODE OP0 OP1) and store the result in a new register + of mode MODE. Return that new register. */ + +static rtx +loongarch_force_binary (machine_mode mode, enum rtx_code code, rtx op0, + rtx op1) +{ + rtx reg; + + reg = gen_reg_rtx (mode); + loongarch_emit_binary (code, reg, op0, op1); + return reg; +} + +/* Copy VALUE to a register and return that register. If new pseudos + are allowed, copy it into a new register, otherwise use DEST. */ + +static rtx +loongarch_force_temporary (rtx dest, rtx value) +{ + if (can_create_pseudo_p ()) + return force_reg (Pmode, value); + else + { + loongarch_emit_move (dest, value); + return dest; + } +} + +/* Wrap symbol or label BASE in an UNSPEC address of type SYMBOL_TYPE, + then add CONST_INT OFFSET to the result. */ + +static rtx +loongarch_unspec_address_offset (rtx base, rtx offset, + enum loongarch_symbol_type symbol_type) +{ + base = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, base), + UNSPEC_ADDRESS_FIRST + symbol_type); + if (offset != const0_rtx) + base = gen_rtx_PLUS (Pmode, base, offset); + return gen_rtx_CONST (Pmode, base); +} + +/* Return an UNSPEC address with underlying address ADDRESS and symbol + type SYMBOL_TYPE. */ + +rtx +loongarch_unspec_address (rtx address, enum loongarch_symbol_type symbol_type) +{ + rtx base, offset; + + split_const (address, &base, &offset); + return loongarch_unspec_address_offset (base, offset, symbol_type); +} + +/* If OP is an UNSPEC address, return the address to which it refers, + otherwise return OP itself. 
*/ + +rtx +loongarch_strip_unspec_address (rtx op) +{ + rtx base, offset; + + split_const (op, &base, &offset); + if (UNSPEC_ADDRESS_P (base)) + op = plus_constant (Pmode, UNSPEC_ADDRESS (base), INTVAL (offset)); + return op; +} + +/* Return a legitimate address for REG + OFFSET. TEMP is as for + loongarch_force_temporary; it is only needed when OFFSET is not a + IMM12_OPERAND. */ + +static rtx +loongarch_add_offset (rtx temp, rtx reg, HOST_WIDE_INT offset) +{ + if (!IMM12_OPERAND (offset)) + { + rtx high; + + /* Leave OFFSET as a 12-bit offset and put the excess in HIGH. + The addition inside the macro CONST_HIGH_PART may cause an + overflow, so we need to force a sign-extension check. */ + high = gen_int_mode (CONST_HIGH_PART (offset), Pmode); + offset = CONST_LOW_PART (offset); + high = loongarch_force_temporary (temp, high); + reg = loongarch_force_temporary (temp, gen_rtx_PLUS (Pmode, high, reg)); + } + return plus_constant (Pmode, reg, offset); +} + +/* The __tls_get_attr symbol. */ +static GTY (()) rtx loongarch_tls_symbol; + +/* Load an entry from the GOT for a TLS GD access. */ + +static rtx +loongarch_got_load_tls_gd (rtx dest, rtx sym) +{ + return gen_got_load_tls_gd (Pmode, dest, sym); +} + +/* Load an entry from the GOT for a TLS LD access. */ + +static rtx +loongarch_got_load_tls_ld (rtx dest, rtx sym) +{ + return gen_got_load_tls_ld (Pmode, dest, sym); +} + +/* Load an entry from the GOT for a TLS IE access. */ + +static rtx +loongarch_got_load_tls_ie (rtx dest, rtx sym) +{ + return gen_got_load_tls_ie (Pmode, dest, sym); +} + +/* Add in the thread pointer for a TLS LE access. */ + +static rtx +loongarch_got_load_tls_le (rtx dest, rtx sym) +{ + return gen_got_load_tls_le (Pmode, dest, sym); +} + +/* Return an instruction sequence that calls __tls_get_addr. SYM is + the TLS symbol we are referencing and TYPE is the symbol type to use + (either global dynamic or local dynamic). V0 is an RTX for the + return value location. */ + +static rtx_insn * +loongarch_call_tls_get_addr (rtx sym, enum loongarch_symbol_type type, rtx v0) +{ + rtx loc, a0; + rtx_insn *insn; + + a0 = gen_rtx_REG (Pmode, GP_ARG_FIRST); + + if (!loongarch_tls_symbol) + loongarch_tls_symbol = init_one_libfunc ("__tls_get_addr"); + + loc = loongarch_unspec_address (sym, type); + + start_sequence (); + + if (type == SYMBOL_TLSLDM) + emit_insn (loongarch_got_load_tls_ld (a0, loc)); + else if (type == SYMBOL_TLSGD) + emit_insn (loongarch_got_load_tls_gd (a0, loc)); + else + gcc_unreachable (); + + insn = emit_call_insn (gen_call_value_internal (v0, loongarch_tls_symbol, + const0_rtx)); + RTL_CONST_CALL_P (insn) = 1; + use_reg (&CALL_INSN_FUNCTION_USAGE (insn), a0); + insn = get_insns (); + + end_sequence (); + + return insn; +} + +/* Generate the code to access LOC, a thread-local SYMBOL_REF, and return + its address. The return value will be both a valid address and a valid + SET_SRC (either a REG or a LO_SUM). 
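+   Global- and local-dynamic models go through __tls_get_addr; the
+   initial-exec and local-exec models load the offset (la.tls.ie /
+   la.tls.le) and add the thread pointer directly.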
*/ + +static rtx +loongarch_legitimize_tls_address (rtx loc) +{ + rtx dest, tp, tmp; + enum tls_model model = SYMBOL_REF_TLS_MODEL (loc); + rtx_insn *insn; + + switch (model) + { + case TLS_MODEL_LOCAL_DYNAMIC: + tmp = gen_rtx_REG (Pmode, GP_RETURN); + dest = gen_reg_rtx (Pmode); + insn = loongarch_call_tls_get_addr (loc, SYMBOL_TLSLDM, tmp); + emit_libcall_block (insn, dest, tmp, loc); + break; + + case TLS_MODEL_GLOBAL_DYNAMIC: + tmp = gen_rtx_REG (Pmode, GP_RETURN); + dest = gen_reg_rtx (Pmode); + insn = loongarch_call_tls_get_addr (loc, SYMBOL_TLSGD, tmp); + emit_libcall_block (insn, dest, tmp, loc); + break; + + case TLS_MODEL_INITIAL_EXEC: + /* la.tls.ie; tp-relative add */ + tp = gen_rtx_REG (Pmode, THREAD_POINTER_REGNUM); + tmp = gen_reg_rtx (Pmode); + emit_insn (loongarch_got_load_tls_ie (tmp, loc)); + dest = gen_reg_rtx (Pmode); + emit_insn (gen_add3_insn (dest, tmp, tp)); + break; + + case TLS_MODEL_LOCAL_EXEC: + /* la.tls.le; tp-relative add */ + tp = gen_rtx_REG (Pmode, THREAD_POINTER_REGNUM); + tmp = gen_reg_rtx (Pmode); + emit_insn (loongarch_got_load_tls_le (tmp, loc)); + dest = gen_reg_rtx (Pmode); + emit_insn (gen_add3_insn (dest, tmp, tp)); + break; + + default: + gcc_unreachable (); + } + return dest; +} + +rtx +loongarch_legitimize_call_address (rtx addr) +{ + if (!call_insn_operand (addr, VOIDmode)) + { + rtx reg = gen_reg_rtx (Pmode); + loongarch_emit_move (reg, addr); + return reg; + } + return addr; +} + +/* If X is a PLUS of a CONST_INT, return the two terms in *BASE_PTR + and *OFFSET_PTR. Return X in *BASE_PTR and 0 in *OFFSET_PTR otherwise. */ + +static void +loongarch_split_plus (rtx x, rtx *base_ptr, HOST_WIDE_INT *offset_ptr) +{ + if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) + { + *base_ptr = XEXP (x, 0); + *offset_ptr = INTVAL (XEXP (x, 1)); + } + else + { + *base_ptr = x; + *offset_ptr = 0; + } +} + +/* If X is not a valid address for mode MODE, force it into a register. */ + +static rtx +loongarch_force_address (rtx x, machine_mode mode) +{ + if (!loongarch_legitimate_address_p (mode, x, false)) + x = force_reg (Pmode, x); + return x; +} + +/* This function is used to implement LEGITIMIZE_ADDRESS. If X can + be legitimized in a way that the generic machinery might not expect, + return a new address, otherwise return NULL. MODE is the mode of + the memory being accessed. */ + +static rtx +loongarch_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, + machine_mode mode) +{ + rtx base, addr; + HOST_WIDE_INT offset; + + if (loongarch_tls_symbol_p (x)) + return loongarch_legitimize_tls_address (x); + + /* Handle BASE + OFFSET using loongarch_add_offset. */ + loongarch_split_plus (x, &base, &offset); + if (offset != 0) + { + if (!loongarch_valid_base_register_p (base, mode, false)) + base = copy_to_mode_reg (Pmode, base); + addr = loongarch_add_offset (NULL, base, offset); + return loongarch_force_address (addr, mode); + } + + return x; +} + +/* Load VALUE into DEST. TEMP is as for loongarch_force_temporary. */ + +void +loongarch_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value) +{ + struct loongarch_integer_op codes[LARCH_MAX_INTEGER_OPS]; + machine_mode mode; + unsigned int i, num_ops; + rtx x; + + mode = GET_MODE (dest); + num_ops = loongarch_build_integer (codes, value); + + /* Apply each binary operation to X. Invariant: X is a legitimate + source operand for a SET pattern. 
*/ + x = GEN_INT (codes[0].value); + for (i = 1; i < num_ops; i++) + { + if (!can_create_pseudo_p ()) + { + emit_insn (gen_rtx_SET (temp, x)); + x = temp; + } + else + x = force_reg (mode, x); + + switch (codes[i].method) + { + case METHOD_NORMAL: + x = gen_rtx_fmt_ee (codes[i].code, mode, x, + GEN_INT (codes[i].value)); + break; + case METHOD_LU32I: + emit_insn ( + gen_rtx_SET (x, + gen_rtx_IOR (DImode, + gen_rtx_ZERO_EXTEND ( + DImode, gen_rtx_SUBREG (SImode, x, 0)), + GEN_INT (codes[i].value)))); + break; + case METHOD_LU52I: + emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff), + GEN_INT (codes[i].value))); + break; + case METHOD_INSV: + emit_insn ( + gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT (20), + GEN_INT (32)), + gen_rtx_REG (DImode, 0))); + break; + default: + gcc_unreachable (); + } + } + + emit_insn (gen_rtx_SET (dest, x)); +} + +/* Subroutine of loongarch_legitimize_move. Move constant SRC into register + DEST given that SRC satisfies immediate_operand but doesn't satisfy + move_operand. */ + +static void +loongarch_legitimize_const_move (machine_mode mode, rtx dest, rtx src) +{ + rtx base, offset; + + /* Split moves of big integers into smaller pieces. */ + if (splittable_const_int_operand (src, mode)) + { + loongarch_move_integer (dest, dest, INTVAL (src)); + return; + } + + /* Generate the appropriate access sequences for TLS symbols. */ + if (loongarch_tls_symbol_p (src)) + { + loongarch_emit_move (dest, loongarch_legitimize_tls_address (src)); + return; + } + + /* If we have (const (plus symbol offset)), and that expression cannot + be forced into memory, load the symbol first and add in the offset. + prefer to do this even if the constant _can_ be forced into memory, + as it usually produces better code. */ + split_const (src, &base, &offset); + if (offset != const0_rtx + && (targetm.cannot_force_const_mem (mode, src) + || (can_create_pseudo_p ()))) + { + base = loongarch_force_temporary (dest, base); + loongarch_emit_move (dest, + loongarch_add_offset (NULL, base, INTVAL (offset))); + return; + } + + src = force_const_mem (mode, src); + + loongarch_emit_move (dest, src); +} + +/* If (set DEST SRC) is not a valid move instruction, emit an equivalent + sequence that is valid. */ + +bool +loongarch_legitimize_move (machine_mode mode, rtx dest, rtx src) +{ + if (!register_operand (dest, mode) && !reg_or_0_operand (src, mode)) + { + loongarch_emit_move (dest, force_reg (mode, src)); + return true; + } + + /* Both src and dest are non-registers; one special case is supported where + the source is (const_int 0) and the store can source the zero register. + */ + if (!register_operand (dest, mode) && !register_operand (src, mode) + && !const_0_operand (src, mode)) + { + loongarch_emit_move (dest, force_reg (mode, src)); + return true; + } + + /* We need to deal with constants that would be legitimate + immediate_operands but aren't legitimate move_operands. */ + if (CONSTANT_P (src) && !move_operand (src, mode)) + { + loongarch_legitimize_const_move (mode, dest, src); + set_unique_reg_note (get_last_insn (), REG_EQUAL, copy_rtx (src)); + return true; + } + + return false; +} + +/* Return true if OP refers to small data symbols directly. */ + +static int +loongarch_small_data_pattern_1 (rtx x) +{ + subrtx_var_iterator::array_type array; + FOR_EACH_SUBRTX_VAR (iter, array, x, ALL) + { + rtx x = *iter; + + /* We make no particular guarantee about which symbolic constants are + acceptable as asm operands versus which must be forced into a GPR. 
*/ + if (GET_CODE (x) == ASM_OPERANDS) + iter.skip_subrtxes (); + else if (MEM_P (x)) + { + if (loongarch_small_data_pattern_1 (XEXP (x, 0))) + return true; + iter.skip_subrtxes (); + } + } + return false; +} + +/* Return true if OP refers to small data symbols directly. */ + +bool +loongarch_small_data_pattern_p (rtx op) +{ + return loongarch_small_data_pattern_1 (op); +} + +/* Rewrite *LOC so that it refers to small data using explicit + relocations. */ + +static void +loongarch_rewrite_small_data_1 (rtx *loc) +{ + subrtx_ptr_iterator::array_type array; + FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL) + { + rtx *loc = *iter; + if (MEM_P (*loc)) + { + loongarch_rewrite_small_data_1 (&XEXP (*loc, 0)); + iter.skip_subrtxes (); + } + } +} + +/* Rewrite instruction pattern PATTERN so that it refers to small data + using explicit relocations. */ + +rtx +loongarch_rewrite_small_data (rtx pattern) +{ + pattern = copy_insn (pattern); + loongarch_rewrite_small_data_1 (&pattern); + return pattern; +} + +/* The cost of loading values from the constant pool. It should be + larger than the cost of any constant we want to synthesize inline. */ +#define CONSTANT_POOL_COST COSTS_N_INSNS (8) + +/* Return true if there is a instruction that implements CODE + and if that instruction accepts X as an immediate operand. */ + +static int +loongarch_immediate_operand_p (int code, HOST_WIDE_INT x) +{ + switch (code) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + /* All shift counts are truncated to a valid constant. */ + return true; + + case ROTATE: + case ROTATERT: + return true; + + case AND: + case IOR: + case XOR: + /* These instructions take 12-bit unsigned immediates. */ + return IMM12_OPERAND_UNSIGNED (x); + + case PLUS: + case LT: + case LTU: + /* These instructions take 12-bit signed immediates. */ + return IMM12_OPERAND (x); + + case EQ: + case NE: + case GT: + case GTU: + /* The "immediate" forms of these instructions are really + implemented as comparisons with register 0. */ + return x == 0; + + case GE: + case GEU: + /* Likewise, meaning that the only valid immediate operand is 1. */ + return x == 1; + + case LE: + /* We add 1 to the immediate and use SLT. */ + return IMM12_OPERAND (x + 1); + + case LEU: + /* Likewise SLTU, but reject the always-true case. */ + return IMM12_OPERAND (x + 1) && x + 1 != 0; + + case SIGN_EXTRACT: + case ZERO_EXTRACT: + /* The bit position and size are immediate operands. */ + return 1; + + default: + /* By default assume that $0 can be used for 0. */ + return x == 0; + } +} + +/* Return the cost of binary operation X, given that the instruction + sequence for a word-sized or smaller operation has cost SINGLE_COST + and that the sequence of a double-word operation has cost DOUBLE_COST. + If SPEED is true, optimize for speed otherwise optimize for size. */ + +static int +loongarch_binary_cost (rtx x, int single_cost, int double_cost, bool speed) +{ + int cost; + + if (GET_MODE_SIZE (GET_MODE (x)) == UNITS_PER_WORD * 2) + cost = double_cost; + else + cost = single_cost; + return (cost + + set_src_cost (XEXP (x, 0), GET_MODE (x), speed) + + rtx_cost (XEXP (x, 1), GET_MODE (x), GET_CODE (x), 1, speed)); +} + +/* Return the cost of floating-point multiplications of mode MODE. */ + +static int +loongarch_fp_mult_cost (machine_mode mode) +{ + return mode == DFmode ? loongarch_cost->fp_mult_df + : loongarch_cost->fp_mult_sf; +} + +/* Return the cost of floating-point divisions of mode MODE. 
*/ + +static int +loongarch_fp_div_cost (machine_mode mode) +{ + return mode == DFmode ? loongarch_cost->fp_div_df + : loongarch_cost->fp_div_sf; +} + +/* Return the cost of sign-extending OP to mode MODE, not including the + cost of OP itself. */ + +static int +loongarch_sign_extend_cost (rtx op) +{ + if (MEM_P (op)) + /* Extended loads are as cheap as unextended ones. */ + return 0; + + return COSTS_N_INSNS (1); +} + +/* Return the cost of zero-extending OP to mode MODE, not including the + cost of OP itself. */ + +static int +loongarch_zero_extend_cost (rtx op) +{ + if (MEM_P (op)) + /* Extended loads are as cheap as unextended ones. */ + return 0; + + /* We can use ANDI. */ + return COSTS_N_INSNS (1); +} + +/* Return the cost of moving between two registers of mode MODE, + assuming that the move will be in pieces of at most UNITS bytes. */ + +static int +loongarch_set_reg_reg_piece_cost (machine_mode mode, unsigned int units) +{ + return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units); +} + +/* Return the cost of moving between two registers of mode MODE. */ + +static int +loongarch_set_reg_reg_cost (machine_mode mode) +{ + switch (GET_MODE_CLASS (mode)) + { + case MODE_CC: + return loongarch_set_reg_reg_piece_cost (mode, GET_MODE_SIZE (CCmode)); + + case MODE_FLOAT: + case MODE_COMPLEX_FLOAT: + case MODE_VECTOR_FLOAT: + if (TARGET_HARD_FLOAT) + return loongarch_set_reg_reg_piece_cost (mode, UNITS_PER_HWFPVALUE); + /* Fall through. */ + + default: + return loongarch_set_reg_reg_piece_cost (mode, UNITS_PER_WORD); + } +} + +/* Implement TARGET_RTX_COSTS. */ + +static bool +loongarch_rtx_costs (rtx x, machine_mode mode, int outer_code, + int opno ATTRIBUTE_UNUSED, int *total, bool speed) +{ + int code = GET_CODE (x); + bool float_mode_p = FLOAT_MODE_P (mode); + int cost; + rtx addr; + + if (outer_code == COMPARE) + { + gcc_assert (CONSTANT_P (x)); + *total = 0; + return true; + } + + switch (code) + { + case CONST_INT: + if (TARGET_64BIT && outer_code == AND && UINTVAL (x) == 0xffffffff) + { + *total = 0; + return true; + } + + /* When not optimizing for size, we care more about the cost + of hot code, and hot code is often in a loop. If a constant + operand needs to be forced into a register, we will often be + able to hoist the constant load out of the loop, so the load + should not contribute to the cost. */ + if (speed || loongarch_immediate_operand_p (outer_code, INTVAL (x))) + { + *total = 0; + return true; + } + /* Fall through. */ + + case CONST: + case SYMBOL_REF: + case LABEL_REF: + case CONST_DOUBLE: + cost = loongarch_const_insns (x); + if (cost > 0) + { + if (cost == 1 && outer_code == SET + && !(float_mode_p && TARGET_HARD_FLOAT)) + cost = 0; + else if ((outer_code == SET || GET_MODE (x) == VOIDmode)) + cost = 1; + *total = COSTS_N_INSNS (cost); + return true; + } + /* The value will need to be fetched from the constant pool. */ + *total = CONSTANT_POOL_COST; + return true; + + case MEM: + /* If the address is legitimate, return the number of + instructions it needs. */ + addr = XEXP (x, 0); + /* Check for a scaled indexed address. */ + if (loongarch_index_address_p (addr, mode)) + { + *total = COSTS_N_INSNS (2); + return true; + } + cost = loongarch_address_insns (addr, mode, true); + if (cost > 0) + { + *total = COSTS_N_INSNS (cost + 1); + return true; + } + /* Otherwise use the default handling. */ + return false; + + case FFS: + *total = COSTS_N_INSNS (6); + return false; + + case NOT: + *total = COSTS_N_INSNS (GET_MODE_SIZE (mode) > UNITS_PER_WORD ? 
2 : 1); + return false; + + case AND: + /* Check for a *clear_upper32 pattern and treat it like a zero + extension. See the pattern's comment for details. */ + if (TARGET_64BIT && mode == DImode && CONST_INT_P (XEXP (x, 1)) + && UINTVAL (XEXP (x, 1)) == 0xffffffff) + { + *total = (loongarch_zero_extend_cost (XEXP (x, 0)) + + set_src_cost (XEXP (x, 0), mode, speed)); + return true; + } + /* (AND (NOT op0) (NOT op1) is a nor operation that can be done in + a single instruction. */ + if (GET_CODE (XEXP (x, 0)) == NOT && GET_CODE (XEXP (x, 1)) == NOT) + { + cost = GET_MODE_SIZE (mode) > UNITS_PER_WORD ? 2 : 1; + *total = (COSTS_N_INSNS (cost) + + set_src_cost (XEXP (XEXP (x, 0), 0), mode, speed) + + set_src_cost (XEXP (XEXP (x, 1), 0), mode, speed)); + return true; + } + + /* Fall through. */ + + case IOR: + case XOR: + /* Double-word operations use two single-word operations. */ + *total = loongarch_binary_cost (x, COSTS_N_INSNS (1), COSTS_N_INSNS (2), + speed); + return true; + + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + case ROTATE: + case ROTATERT: + if (CONSTANT_P (XEXP (x, 1))) + *total = loongarch_binary_cost (x, COSTS_N_INSNS (1), + COSTS_N_INSNS (4), speed); + else + *total = loongarch_binary_cost (x, COSTS_N_INSNS (1), + COSTS_N_INSNS (12), speed); + return true; + + case ABS: + if (float_mode_p) + *total = loongarch_cost->fp_add; + else + *total = COSTS_N_INSNS (4); + return false; + + case LT: + case LTU: + case LE: + case LEU: + case GT: + case GTU: + case GE: + case GEU: + case EQ: + case NE: + case UNORDERED: + case LTGT: + case UNGE: + case UNGT: + case UNLE: + case UNLT: + /* Branch comparisons have VOIDmode, so use the first operand's + mode instead. */ + mode = GET_MODE (XEXP (x, 0)); + if (FLOAT_MODE_P (mode)) + { + *total = loongarch_cost->fp_add; + return false; + } + *total = loongarch_binary_cost (x, COSTS_N_INSNS (1), COSTS_N_INSNS (4), + speed); + return true; + + case MINUS: + case PLUS: + if (float_mode_p) + { + *total = loongarch_cost->fp_add; + return false; + } + + /* If it's an add + mult (which is equivalent to shift left) and + it's immediate operand satisfies const_immalsl_operand predicate. */ + if ((mode == SImode || (TARGET_64BIT && mode == DImode)) + && GET_CODE (XEXP (x, 0)) == MULT) + { + rtx op2 = XEXP (XEXP (x, 0), 1); + if (const_immalsl_operand (op2, mode)) + { + *total = (COSTS_N_INSNS (1) + + set_src_cost (XEXP (XEXP (x, 0), 0), mode, speed) + + set_src_cost (XEXP (x, 1), mode, speed)); + return true; + } + } + + /* Double-word operations require three single-word operations and + an SLTU. */ + *total = loongarch_binary_cost (x, COSTS_N_INSNS (1), COSTS_N_INSNS (4), + speed); + return true; + + case NEG: + if (float_mode_p) + *total = loongarch_cost->fp_add; + else + *total = COSTS_N_INSNS (GET_MODE_SIZE (mode) > UNITS_PER_WORD ? 4 : 1); + return false; + + case FMA: + *total = loongarch_fp_mult_cost (mode); + return false; + + case MULT: + if (float_mode_p) + *total = loongarch_fp_mult_cost (mode); + else if (mode == DImode && !TARGET_64BIT) + *total = (speed + ? loongarch_cost->int_mult_si * 3 + 6 + : COSTS_N_INSNS (7)); + else if (!speed) + *total = COSTS_N_INSNS (1) + 1; + else if (mode == DImode) + *total = loongarch_cost->int_mult_di; + else + *total = loongarch_cost->int_mult_si; + return false; + + case DIV: + /* Check for a reciprocal. 
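+	 That is, 1.0 / X, which is only considered here under
+	 -funsafe-math-optimizations; when combined with a square root it
+	 matches the reciprocal-square-root patterns and the division is
+	 counted as free, otherwise it is costed as an ordinary division.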
*/ + if (float_mode_p + && flag_unsafe_math_optimizations + && XEXP (x, 0) == CONST1_RTX (mode)) + { + if (outer_code == SQRT || GET_CODE (XEXP (x, 1)) == SQRT) + /* An rsqrt<mode>a or rsqrt<mode>b pattern. Count the + division as being free. */ + *total = set_src_cost (XEXP (x, 1), mode, speed); + else + *total = (loongarch_fp_div_cost (mode) + + set_src_cost (XEXP (x, 1), mode, speed)); + return true; + } + /* Fall through. */ + + case SQRT: + case MOD: + if (float_mode_p) + { + *total = loongarch_fp_div_cost (mode); + return false; + } + /* Fall through. */ + + case UDIV: + case UMOD: + if (!speed) + { + *total = COSTS_N_INSNS (loongarch_idiv_insns (mode)); + } + else if (mode == DImode) + *total = loongarch_cost->int_div_di; + else + *total = loongarch_cost->int_div_si; + return false; + + case SIGN_EXTEND: + *total = loongarch_sign_extend_cost (XEXP (x, 0)); + return false; + + case ZERO_EXTEND: + *total = loongarch_zero_extend_cost (XEXP (x, 0)); + return false; + case TRUNCATE: + /* Costings for highpart multiplies. Matching patterns of the form: + + (lshiftrt:DI (mult:DI (sign_extend:DI (...) + (sign_extend:DI (...)) + (const_int 32) + */ + if ((GET_CODE (XEXP (x, 0)) == ASHIFTRT + || GET_CODE (XEXP (x, 0)) == LSHIFTRT) + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) + && ((INTVAL (XEXP (XEXP (x, 0), 1)) == 32 + && GET_MODE (XEXP (x, 0)) == DImode) + || (TARGET_64BIT + && INTVAL (XEXP (XEXP (x, 0), 1)) == 64 + && GET_MODE (XEXP (x, 0)) == TImode)) + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND + && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND) + || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND + && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) + == ZERO_EXTEND)))) + { + if (!speed) + *total = COSTS_N_INSNS (1) + 1; + else if (mode == DImode) + *total = loongarch_cost->int_mult_di; + else + *total = loongarch_cost->int_mult_si; + + /* Sign extension is free, zero extension costs for DImode when + on a 64bit core / when DMUL is present. */ + for (int i = 0; i < 2; ++i) + { + rtx op = XEXP (XEXP (XEXP (x, 0), 0), i); + if (TARGET_64BIT + && GET_CODE (op) == ZERO_EXTEND + && GET_MODE (op) == DImode) + *total += rtx_cost (op, DImode, MULT, i, speed); + else + *total += rtx_cost (XEXP (op, 0), VOIDmode, GET_CODE (op), 0, + speed); + } + + return true; + } + return false; + + case FLOAT: + case UNSIGNED_FLOAT: + case FIX: + case FLOAT_EXTEND: + case FLOAT_TRUNCATE: + *total = loongarch_cost->fp_add; + return false; + + case SET: + if (register_operand (SET_DEST (x), VOIDmode) + && reg_or_0_operand (SET_SRC (x), VOIDmode)) + { + *total = loongarch_set_reg_reg_cost (GET_MODE (SET_DEST (x))); + return true; + } + return false; + + default: + return false; + } +} + +/* Implement TARGET_ADDRESS_COST. */ + +static int +loongarch_address_cost (rtx addr, machine_mode mode, + addr_space_t as ATTRIBUTE_UNUSED, + bool speed ATTRIBUTE_UNUSED) +{ + return loongarch_address_insns (addr, mode, false); +} + +/* Return one word of double-word value OP, taking into account the fixed + endianness of certain registers. HIGH_P is true to select the high part, + false to select the low part. */ + +rtx +loongarch_subword (rtx op, bool high_p) +{ + unsigned int byte; + machine_mode mode; + + byte = high_p ? UNITS_PER_WORD : 0; + mode = GET_MODE (op); + if (mode == VOIDmode) + mode = TARGET_64BIT ? 
TImode : DImode; + + if (FP_REG_RTX_P (op)) + return gen_rtx_REG (word_mode, REGNO (op) + high_p); + + if (MEM_P (op)) + return loongarch_rewrite_small_data (adjust_address (op, word_mode, byte)); + + return simplify_gen_subreg (word_mode, op, mode, byte); +} + +/* Return true if a move from SRC to DEST should be split into two. + SPLIT_TYPE describes the split condition. */ + +bool +loongarch_split_move_p (rtx dest, rtx src) +{ + /* FPR-to-FPR moves can be done in a single instruction, if they're + allowed at all. */ + unsigned int size = GET_MODE_SIZE (GET_MODE (dest)); + if (size == 8 && FP_REG_RTX_P (src) && FP_REG_RTX_P (dest)) + return false; + + /* Check for floating-point loads and stores. */ + if (size == 8) + { + if (FP_REG_RTX_P (dest) && MEM_P (src)) + return false; + if (FP_REG_RTX_P (src) && MEM_P (dest)) + return false; + } + /* Otherwise split all multiword moves. */ + return size > UNITS_PER_WORD; +} + +/* Split a move from SRC to DEST, given that loongarch_split_move_p holds. + SPLIT_TYPE describes the split condition. */ + +void +loongarch_split_move (rtx dest, rtx src, rtx insn_) +{ + rtx low_dest; + + gcc_checking_assert (loongarch_split_move_p (dest, src)); + if (FP_REG_RTX_P (dest) || FP_REG_RTX_P (src)) + { + if (!TARGET_64BIT && GET_MODE (dest) == DImode) + emit_insn (gen_move_doubleword_fprdi (dest, src)); + else if (!TARGET_64BIT && GET_MODE (dest) == DFmode) + emit_insn (gen_move_doubleword_fprdf (dest, src)); + else if (TARGET_64BIT && GET_MODE (dest) == TFmode) + emit_insn (gen_move_doubleword_fprtf (dest, src)); + else + gcc_unreachable (); + } + else + { + /* The operation can be split into two normal moves. Decide in + which order to do them. */ + low_dest = loongarch_subword (dest, false); + if (REG_P (low_dest) && reg_overlap_mentioned_p (low_dest, src)) + { + loongarch_emit_move (loongarch_subword (dest, true), + loongarch_subword (src, true)); + loongarch_emit_move (low_dest, loongarch_subword (src, false)); + } + else + { + loongarch_emit_move (low_dest, loongarch_subword (src, false)); + loongarch_emit_move (loongarch_subword (dest, true), + loongarch_subword (src, true)); + } + } + + /* This is a hack. See if the next insn uses DEST and if so, see if we + can forward SRC for DEST. This is most useful if the next insn is a + simple store. */ + rtx_insn *insn = (rtx_insn *) insn_; + struct loongarch_address_info addr = {}; + if (insn) + { + rtx_insn *next = next_nonnote_nondebug_insn_bb (insn); + if (next) + { + rtx set = single_set (next); + if (set && SET_SRC (set) == dest) + { + if (MEM_P (src)) + { + rtx tmp = XEXP (src, 0); + loongarch_classify_address (&addr, tmp, GET_MODE (tmp), + true); + if (addr.reg && !reg_overlap_mentioned_p (dest, addr.reg)) + validate_change (next, &SET_SRC (set), src, false); + } + else + validate_change (next, &SET_SRC (set), src, false); + } + } + } +} + +/* Return true if a move from SRC to DEST in INSN should be split. */ + +bool +loongarch_split_move_insn_p (rtx dest, rtx src) +{ + return loongarch_split_move_p (dest, src); +} + +/* Split a move from SRC to DEST in INSN, given that + loongarch_split_move_insn_p holds. */ + +void +loongarch_split_move_insn (rtx dest, rtx src, rtx insn) +{ + loongarch_split_move (dest, src, insn); +} + +/* Implement TARGET_CONSTANT_ALIGNMENT. 
*/ + +static HOST_WIDE_INT +loongarch_constant_alignment (const_tree exp, HOST_WIDE_INT align) +{ + if (TREE_CODE (exp) == STRING_CST || TREE_CODE (exp) == CONSTRUCTOR) + return MAX (align, BITS_PER_WORD); + return align; +} + +const char * +loongarch_output_move_index (rtx x, machine_mode mode, bool ldr) +{ + int index = exact_log2 (GET_MODE_SIZE (mode)); + if (!IN_RANGE (index, 0, 3)) + return NULL; + + struct loongarch_address_info info; + if ((loongarch_classify_address (&info, x, mode, false) + && !(info.type == ADDRESS_REG_REG)) + || !loongarch_legitimate_address_p (mode, x, false)) + return NULL; + + const char *const insn[][4] = + { + { + "stx.b\t%z1,%0", + "stx.h\t%z1,%0", + "stx.w\t%z1,%0", + "stx.d\t%z1,%0", + }, + { + "ldx.bu\t%0,%1", + "ldx.hu\t%0,%1", + "ldx.w\t%0,%1", + "ldx.d\t%0,%1", + } + }; + + return insn[ldr][index]; +} + +const char * +loongarch_output_move_index_float (rtx x, machine_mode mode, bool ldr) +{ + int index = exact_log2 (GET_MODE_SIZE (mode)); + if (!IN_RANGE (index, 2, 3)) + return NULL; + + struct loongarch_address_info info; + if ((loongarch_classify_address (&info, x, mode, false) + && !(info.type == ADDRESS_REG_REG)) + || !loongarch_legitimate_address_p (mode, x, false)) + return NULL; + + const char *const insn[][2] = + { + { + "fstx.s\t%1,%0", + "fstx.d\t%1,%0" + }, + { + "fldx.s\t%0,%1", + "fldx.d\t%0,%1" + }, + }; + + return insn[ldr][index-2]; +} + +/* Return the appropriate instructions to move SRC into DEST. Assume + that SRC is operand 1 and DEST is operand 0. */ + +const char * +loongarch_output_move (rtx dest, rtx src) +{ + enum rtx_code dest_code = GET_CODE (dest); + enum rtx_code src_code = GET_CODE (src); + machine_mode mode = GET_MODE (dest); + bool dbl_p = (GET_MODE_SIZE (mode) == 8); + + if (loongarch_split_move_p (dest, src)) + return "#"; + + if ((src_code == REG && GP_REG_P (REGNO (src))) + || (src == CONST0_RTX (mode))) + { + if (dest_code == REG) + { + if (GP_REG_P (REGNO (dest))) + return "or\t%0,%z1,$r0"; + + if (FP_REG_P (REGNO (dest))) + return dbl_p ? "movgr2fr.d\t%0,%z1" : "movgr2fr.w\t%0,%z1"; + } + if (dest_code == MEM) + { + const char *insn = NULL; + insn = loongarch_output_move_index (XEXP (dest, 0), GET_MODE (dest), + false); + if (insn) + return insn; + + rtx offset = XEXP (dest, 0); + if (GET_CODE (offset) == PLUS) + offset = XEXP (offset, 1); + switch (GET_MODE_SIZE (mode)) + { + case 1: + return "st.b\t%z1,%0"; + case 2: + return "st.h\t%z1,%0"; + case 4: + if (const_arith_operand (offset, Pmode)) + return "st.w\t%z1,%0"; + else + return "stptr.w\t%z1,%0"; + case 8: + if (const_arith_operand (offset, Pmode)) + return "st.d\t%z1,%0"; + else + return "stptr.d\t%z1,%0"; + default: + gcc_unreachable (); + } + } + } + if (dest_code == REG && GP_REG_P (REGNO (dest))) + { + if (src_code == REG) + if (FP_REG_P (REGNO (src))) + return dbl_p ? 
"movfr2gr.d\t%0,%1" : "movfr2gr.s\t%0,%1"; + + if (src_code == MEM) + { + const char *insn = NULL; + insn = loongarch_output_move_index (XEXP (src, 0), GET_MODE (src), + true); + if (insn) + return insn; + + rtx offset = XEXP (src, 0); + if (GET_CODE (offset) == PLUS) + offset = XEXP (offset, 1); + switch (GET_MODE_SIZE (mode)) + { + case 1: + return "ld.bu\t%0,%1"; + case 2: + return "ld.hu\t%0,%1"; + case 4: + if (const_arith_operand (offset, Pmode)) + return "ld.w\t%0,%1"; + else + return "ldptr.w\t%0,%1"; + case 8: + if (const_arith_operand (offset, Pmode)) + return "ld.d\t%0,%1"; + else + return "ldptr.d\t%0,%1"; + default: + gcc_unreachable (); + } + } + + if (src_code == CONST_INT) + { + if (LU12I_INT (src)) + return "lu12i.w\t%0,%1>>12\t\t\t# %X1"; + else if (IMM12_INT (src)) + return "addi.w\t%0,$r0,%1\t\t\t# %X1"; + else if (IMM12_INT_UNSIGNED (src)) + return "ori\t%0,$r0,%1\t\t\t# %X1"; + else if (LU52I_INT (src)) + return "lu52i.d\t%0,$r0,%X1>>52\t\t\t# %1"; + else + gcc_unreachable (); + } + + if (symbolic_operand (src, VOIDmode)) + { + if ((TARGET_CMODEL_TINY && (!loongarch_global_symbol_p (src) + || loongarch_symbol_binds_local_p (src))) + || (TARGET_CMODEL_TINY_STATIC && !loongarch_weak_symbol_p (src))) + { + /* The symbol must be aligned to 4 byte. */ + unsigned int align; + + if (LABEL_REF_P (src)) + align = 32 /* Whatever. */; + else if (CONSTANT_POOL_ADDRESS_P (src)) + align = GET_MODE_ALIGNMENT (get_pool_mode (src)); + else if (TREE_CONSTANT_POOL_ADDRESS_P (src)) + { + tree exp = SYMBOL_REF_DECL (src); + align = TYPE_ALIGN (TREE_TYPE (exp)); + align = loongarch_constant_alignment (exp, align); + } + else if (SYMBOL_REF_DECL (src)) + align = DECL_ALIGN (SYMBOL_REF_DECL (src)); + else if (SYMBOL_REF_HAS_BLOCK_INFO_P (src) + && SYMBOL_REF_BLOCK (src) != NULL) + align = SYMBOL_REF_BLOCK (src)->alignment; + else + align = BITS_PER_UNIT; + + if (align % (4 * 8) == 0) + return "pcaddi\t%0,%%pcrel(%1)>>2"; + } + if (TARGET_CMODEL_TINY + || TARGET_CMODEL_TINY_STATIC + || TARGET_CMODEL_NORMAL + || TARGET_CMODEL_LARGE) + { + if (!loongarch_global_symbol_p (src) + || loongarch_symbol_binds_local_p (src)) + return "la.local\t%0,%1"; + else + return "la.global\t%0,%1"; + } + if (TARGET_CMODEL_EXTREME) + { + sorry ("Normal symbol loading not implemented in extreme mode."); + gcc_unreachable (); + } + + } + } + if (src_code == REG && FP_REG_P (REGNO (src))) + { + if (dest_code == REG && FP_REG_P (REGNO (dest))) + return dbl_p ? "fmov.d\t%0,%1" : "fmov.s\t%0,%1"; + + if (dest_code == MEM) + { + const char *insn = NULL; + insn = loongarch_output_move_index_float (XEXP (dest, 0), + GET_MODE (dest), + false); + if (insn) + return insn; + + return dbl_p ? "fst.d\t%1,%0" : "fst.s\t%1,%0"; + } + } + if (dest_code == REG && FP_REG_P (REGNO (dest))) + { + if (src_code == MEM) + { + const char *insn = NULL; + insn = loongarch_output_move_index_float (XEXP (src, 0), + GET_MODE (src), + true); + if (insn) + return insn; + + return dbl_p ? "fld.d\t%0,%1" : "fld.s\t%0,%1"; + } + } + gcc_unreachable (); +} + +/* Return true if CMP1 is a suitable second operand for integer ordering + test CODE. 
*/ + +static bool +loongarch_int_order_operand_ok_p (enum rtx_code code, rtx cmp1) +{ + switch (code) + { + case GT: + case GTU: + return reg_or_0_operand (cmp1, VOIDmode); + + case GE: + case GEU: + return cmp1 == const1_rtx; + + case LT: + case LTU: + return arith_operand (cmp1, VOIDmode); + + case LE: + return sle_operand (cmp1, VOIDmode); + + case LEU: + return sleu_operand (cmp1, VOIDmode); + + default: + gcc_unreachable (); + } +} + +/* Return true if *CMP1 (of mode MODE) is a valid second operand for + integer ordering test *CODE, or if an equivalent combination can + be formed by adjusting *CODE and *CMP1. When returning true, update + *CODE and *CMP1 with the chosen code and operand, otherwise leave + them alone. */ + +static bool +loongarch_canonicalize_int_order_test (enum rtx_code *code, rtx *cmp1, + machine_mode mode) +{ + HOST_WIDE_INT plus_one; + + if (loongarch_int_order_operand_ok_p (*code, *cmp1)) + return true; + + if (CONST_INT_P (*cmp1)) + switch (*code) + { + case LE: + plus_one = trunc_int_for_mode (UINTVAL (*cmp1) + 1, mode); + if (INTVAL (*cmp1) < plus_one) + { + *code = LT; + *cmp1 = force_reg (mode, GEN_INT (plus_one)); + return true; + } + break; + + case LEU: + plus_one = trunc_int_for_mode (UINTVAL (*cmp1) + 1, mode); + if (plus_one != 0) + { + *code = LTU; + *cmp1 = force_reg (mode, GEN_INT (plus_one)); + return true; + } + break; + + default: + break; + } + return false; +} + +/* Compare CMP0 and CMP1 using ordering test CODE and store the result + in TARGET. CMP0 and TARGET are register_operands. If INVERT_PTR + is nonnull, it's OK to set TARGET to the inverse of the result and + flip *INVERT_PTR instead. */ + +static void +loongarch_emit_int_order_test (enum rtx_code code, bool *invert_ptr, + rtx target, rtx cmp0, rtx cmp1) +{ + machine_mode mode; + + /* First see if there is a LoongArch instruction that can do this operation. + If not, try doing the same for the inverse operation. If that also + fails, force CMP1 into a register and try again. */ + mode = GET_MODE (cmp0); + if (loongarch_canonicalize_int_order_test (&code, &cmp1, mode)) + loongarch_emit_binary (code, target, cmp0, cmp1); + else + { + enum rtx_code inv_code = reverse_condition (code); + if (!loongarch_canonicalize_int_order_test (&inv_code, &cmp1, mode)) + { + cmp1 = force_reg (mode, cmp1); + loongarch_emit_int_order_test (code, invert_ptr, target, cmp0, cmp1); + } + else if (invert_ptr == 0) + { + rtx inv_target; + + inv_target = loongarch_force_binary (GET_MODE (target), + inv_code, cmp0, cmp1); + loongarch_emit_binary (XOR, target, inv_target, const1_rtx); + } + else + { + *invert_ptr = !*invert_ptr; + loongarch_emit_binary (inv_code, target, cmp0, cmp1); + } + } +} + +/* Return a register that is zero if CMP0 and CMP1 are equal. + The register will have the same mode as CMP0. */ + +static rtx +loongarch_zero_if_equal (rtx cmp0, rtx cmp1) +{ + if (cmp1 == const0_rtx) + return cmp0; + + if (uns_arith_operand (cmp1, VOIDmode)) + return expand_binop (GET_MODE (cmp0), xor_optab, cmp0, cmp1, 0, 0, + OPTAB_DIRECT); + + return expand_binop (GET_MODE (cmp0), sub_optab, cmp0, cmp1, 0, 0, + OPTAB_DIRECT); +} + +/* Allocate a floating-point condition-code register of mode MODE. 
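+    Registers are handed out round-robin starting at FCC_REG_FIRST, with the
+    per-function counter wrapping once the last FCC register has been used;
+    only FCCmode (a single condition-code register) is handled here.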
*/ + +static rtx +loongarch_allocate_fcc (machine_mode mode) +{ + unsigned int regno, count; + + gcc_assert (TARGET_HARD_FLOAT); + + if (mode == FCCmode) + count = 1; + else + gcc_unreachable (); + + cfun->machine->next_fcc += -cfun->machine->next_fcc & (count - 1); + if (cfun->machine->next_fcc > FCC_REG_LAST - FCC_REG_FIRST) + cfun->machine->next_fcc = 0; + + regno = FCC_REG_FIRST + cfun->machine->next_fcc; + cfun->machine->next_fcc += count; + return gen_rtx_REG (mode, regno); +} + +/* Sign- or zero-extend OP0 and OP1 for integer comparisons. */ + +static void +loongarch_extend_comparands (rtx_code code, rtx *op0, rtx *op1) +{ + /* Comparisons consider all XLEN bits, so extend sub-XLEN values. */ + if (GET_MODE_SIZE (word_mode) > GET_MODE_SIZE (GET_MODE (*op0))) + { + /* TODO: checkout It is more profitable to zero-extend QImode values. */ + if (unsigned_condition (code) == code && GET_MODE (*op0) == QImode) + { + *op0 = gen_rtx_ZERO_EXTEND (word_mode, *op0); + if (CONST_INT_P (*op1)) + *op1 = GEN_INT ((uint8_t) INTVAL (*op1)); + else + *op1 = gen_rtx_ZERO_EXTEND (word_mode, *op1); + } + else + { + *op0 = gen_rtx_SIGN_EXTEND (word_mode, *op0); + if (*op1 != const0_rtx) + *op1 = gen_rtx_SIGN_EXTEND (word_mode, *op1); + } + } +} + +/* Convert a comparison into something that can be used in a branch. On + entry, *OP0 and *OP1 are the values being compared and *CODE is the code + used to compare them. Update them to describe the final comparison. */ + +static void +loongarch_emit_int_compare (enum rtx_code *code, rtx *op0, rtx *op1) +{ + static const enum rtx_code + mag_comparisons[][2] = {{LEU, LTU}, {GTU, GEU}, {LE, LT}, {GT, GE}}; + + if (splittable_const_int_operand (*op1, VOIDmode)) + { + HOST_WIDE_INT rhs = INTVAL (*op1); + + if (*code == EQ || *code == NE) + { + /* Convert e.g. OP0 == 2048 into OP0 - 2048 == 0. */ + if (IMM12_OPERAND (-rhs)) + { + *op0 = loongarch_force_binary (GET_MODE (*op0), PLUS, *op0, + GEN_INT (-rhs)); + *op1 = const0_rtx; + } + } + else + { + /* Convert e.g. (OP0 <= 0xFFF) into (OP0 < 0x1000). */ + for (size_t i = 0; i < ARRAY_SIZE (mag_comparisons); i++) + { + HOST_WIDE_INT new_rhs; + bool increment = *code == mag_comparisons[i][0]; + bool decrement = *code == mag_comparisons[i][1]; + if (!increment && !decrement) + continue; + + new_rhs = rhs + (increment ? 1 : -1); + if (loongarch_integer_cost (new_rhs) + < loongarch_integer_cost (rhs) + && (rhs < 0) == (new_rhs < 0)) + { + *op1 = GEN_INT (new_rhs); + *code = mag_comparisons[i][increment]; + } + break; + } + } + } + + loongarch_extend_comparands (*code, op0, op1); + + *op0 = force_reg (word_mode, *op0); + if (*op1 != const0_rtx) + *op1 = force_reg (word_mode, *op1); +} + +/* Like loongarch_emit_int_compare, but for floating-point comparisons. */ + +static void +loongarch_emit_float_compare (enum rtx_code *code, rtx *op0, rtx *op1) +{ + rtx cmp_op0 = *op0; + rtx cmp_op1 = *op1; + + /* Floating-point tests use a separate FCMP.cond.fmt + comparison to set a register. The branch or conditional move will + then compare that register against zero. + + Set CMP_CODE to the code of the comparison instruction and + *CODE to the code that the branch or move should use. */ + enum rtx_code cmp_code = *code; + /* Three FP conditions cannot be implemented by reversing the + operands for FCMP.cond.fmt, instead a reversed condition code is + required and a test for false. 
*/ + *code = NE; + *op0 = loongarch_allocate_fcc (FCCmode); + + *op1 = const0_rtx; + loongarch_emit_binary (cmp_code, *op0, cmp_op0, cmp_op1); +} + +/* Try performing the comparison in OPERANDS[1], whose arms are OPERANDS[2] + and OPERAND[3]. Store the result in OPERANDS[0]. + + On 64-bit targets, the mode of the comparison and target will always be + SImode, thus possibly narrower than that of the comparison's operands. */ + +void +loongarch_expand_scc (rtx operands[]) +{ + rtx target = operands[0]; + enum rtx_code code = GET_CODE (operands[1]); + rtx op0 = operands[2]; + rtx op1 = operands[3]; + + loongarch_extend_comparands (code, &op0, &op1); + op0 = force_reg (word_mode, op0); + + gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT); + + if (code == EQ || code == NE) + { + rtx zie = loongarch_zero_if_equal (op0, op1); + loongarch_emit_binary (code, target, zie, const0_rtx); + } + else + loongarch_emit_int_order_test (code, 0, target, op0, op1); +} + +/* Compare OPERANDS[1] with OPERANDS[2] using comparison code + CODE and jump to OPERANDS[3] if the condition holds. */ + +void +loongarch_expand_conditional_branch (rtx *operands) +{ + enum rtx_code code = GET_CODE (operands[0]); + rtx op0 = operands[1]; + rtx op1 = operands[2]; + rtx condition; + + if (FLOAT_MODE_P (GET_MODE (op1))) + loongarch_emit_float_compare (&code, &op0, &op1); + else + loongarch_emit_int_compare (&code, &op0, &op1); + + condition = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); + emit_jump_insn (gen_condjump (condition, operands[3])); +} + +/* Perform the comparison in OPERANDS[1]. Move OPERANDS[2] into OPERANDS[0] + if the condition holds, otherwise move OPERANDS[3] into OPERANDS[0]. */ + +void +loongarch_expand_conditional_move (rtx *operands) +{ + enum rtx_code code = GET_CODE (operands[1]); + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (FLOAT_MODE_P (GET_MODE (op1))) + loongarch_emit_float_compare (&code, &op0, &op1); + else + { + loongarch_extend_comparands (code, &op0, &op1); + + op0 = force_reg (word_mode, op0); + + if (code == EQ || code == NE) + { + op0 = loongarch_zero_if_equal (op0, op1); + op1 = const0_rtx; + } + else + { + /* The comparison needs a separate scc instruction. Store the + result of the scc in *OP0 and compare it against zero. */ + bool invert = false; + rtx target = gen_reg_rtx (GET_MODE (op0)); + loongarch_emit_int_order_test (code, &invert, target, op0, op1); + code = invert ? EQ : NE; + op0 = target; + op1 = const0_rtx; + } + } + + rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1); + /* There is no direct support for general conditional GP move involving + two registers using SEL. */ + if (INTEGRAL_MODE_P (GET_MODE (operands[2])) + && register_operand (operands[2], VOIDmode) + && register_operand (operands[3], VOIDmode)) + { + machine_mode mode = GET_MODE (operands[0]); + rtx temp = gen_reg_rtx (mode); + rtx temp2 = gen_reg_rtx (mode); + + emit_insn (gen_rtx_SET (temp, + gen_rtx_IF_THEN_ELSE (mode, cond, + operands[2], const0_rtx))); + + /* Flip the test for the second operand. */ + cond = gen_rtx_fmt_ee ((code == EQ) ? NE : EQ, GET_MODE (op0), op0, op1); + + emit_insn (gen_rtx_SET (temp2, + gen_rtx_IF_THEN_ELSE (mode, cond, + operands[3], const0_rtx))); + + /* Merge the two results, at least one is guaranteed to be zero. 
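+     ORing TEMP and TEMP2 therefore yields whichever of OPERANDS[2] and
+     OPERANDS[3] the condition selected.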
*/ + emit_insn (gen_rtx_SET (operands[0], gen_rtx_IOR (mode, temp, temp2))); + } + else + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_IF_THEN_ELSE (GET_MODE (operands[0]), cond, + operands[2], operands[3]))); +} + +/* Implement TARGET_EXPAND_BUILTIN_VA_START. */ + +static void +loongarch_va_start (tree valist, rtx nextarg) +{ + nextarg = plus_constant (Pmode, nextarg, -cfun->machine->varargs_size); + std_expand_builtin_va_start (valist, nextarg); +} + +/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */ + +static bool +loongarch_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, + tree exp ATTRIBUTE_UNUSED) +{ + /* Always OK. */ + return true; +} + +/* Emit straight-line code to move LENGTH bytes from SRC to DEST. + Assume that the areas do not overlap. */ + +static void +loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length) +{ + HOST_WIDE_INT offset, delta; + unsigned HOST_WIDE_INT bits; + int i; + machine_mode mode; + rtx *regs; + + bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))); + + mode = int_mode_for_size (bits, 0).require (); + delta = bits / BITS_PER_UNIT; + + /* Allocate a buffer for the temporary registers. */ + regs = XALLOCAVEC (rtx, length / delta); + + /* Load as many BITS-sized chunks as possible. Use a normal load if + the source has enough alignment, otherwise use left/right pairs. */ + for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++) + { + regs[i] = gen_reg_rtx (mode); + loongarch_emit_move (regs[i], adjust_address (src, mode, offset)); + } + + for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++) + loongarch_emit_move (adjust_address (dest, mode, offset), regs[i]); + + /* Mop up any left-over bytes. */ + if (offset < length) + { + src = adjust_address (src, BLKmode, offset); + dest = adjust_address (dest, BLKmode, offset); + move_by_pieces (dest, src, length - offset, + MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), + (enum memop_ret) 0); + } +} + +/* Helper function for doing a loop-based block operation on memory + reference MEM. Each iteration of the loop will operate on LENGTH + bytes of MEM. + + Create a new base register for use within the loop and point it to + the start of MEM. Create a new memory reference that uses this + register. Store them in *LOOP_REG and *LOOP_MEM respectively. */ + +static void +loongarch_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg, + rtx *loop_mem) +{ + *loop_reg = copy_addr_to_reg (XEXP (mem, 0)); + + /* Although the new mem does not refer to a known location, + it does keep up to LENGTH bytes of alignment. */ + *loop_mem = change_address (mem, BLKmode, *loop_reg); + set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT)); +} + +/* Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER + bytes at a time. LENGTH must be at least BYTES_PER_ITER. Assume that + the memory regions do not overlap. */ + +static void +loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, + HOST_WIDE_INT bytes_per_iter) +{ + rtx_code_label *label; + rtx src_reg, dest_reg, final_src, test; + HOST_WIDE_INT leftover; + + leftover = length % bytes_per_iter; + length -= leftover; + + /* Create registers and memory references for use within the loop. */ + loongarch_adjust_block_mem (src, bytes_per_iter, &src_reg, &src); + loongarch_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest); + + /* Calculate the value that SRC_REG should have after the last iteration + of the loop. 
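+     LENGTH has already been rounded down to a multiple of BYTES_PER_ITER,
+     so this is simply the start address plus the number of bytes copied by
+     the loop itself; any remainder is handled after the loop.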
*/ + final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length), 0, + 0, OPTAB_WIDEN); + + /* Emit the start of the loop. */ + label = gen_label_rtx (); + emit_label (label); + + /* Emit the loop body. */ + loongarch_block_move_straight (dest, src, bytes_per_iter); + + /* Move on to the next block. */ + loongarch_emit_move (src_reg, + plus_constant (Pmode, src_reg, bytes_per_iter)); + loongarch_emit_move (dest_reg, + plus_constant (Pmode, dest_reg, bytes_per_iter)); + + /* Emit the loop condition. */ + test = gen_rtx_NE (VOIDmode, src_reg, final_src); + if (Pmode == DImode) + emit_jump_insn (gen_cbranchdi4 (test, src_reg, final_src, label)); + else + emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label)); + + /* Mop up any left-over bytes. */ + if (leftover) + loongarch_block_move_straight (dest, src, leftover); + else + /* Temporary fix for PR79150. */ + emit_insn (gen_nop ()); +} + +/* Expand a cpymemsi instruction, which copies LENGTH bytes from + memory reference SRC to memory reference DEST. */ + +bool +loongarch_expand_block_move (rtx dest, rtx src, rtx length) +{ + int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT; + + if (CONST_INT_P (length) + && INTVAL (length) <= loongarch_max_inline_memcpy_size) + { + if (INTVAL (length) <= max_move_bytes) + { + loongarch_block_move_straight (dest, src, INTVAL (length)); + return true; + } + else if (optimize) + { + loongarch_block_move_loop (dest, src, INTVAL (length), + LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER); + return true; + } + } + return false; +} + +/* Return true if loongarch_expand_block_move is the preferred + implementation of the 'cpymemsi' template. */ + +bool +loongarch_do_optimize_block_move_p (void) +{ + /* if -m[no-]memcpy is given explicitly. */ + if (target_flags_explicit & MASK_MEMCPY) + return !TARGET_MEMCPY; + + /* if not, don't optimize under -Os. */ + return !optimize_size; +} + + +/* Expand a QI or HI mode atomic memory operation. + + GENERATOR contains a pointer to the gen_* function that generates + the SI mode underlying atomic operation using masks that we + calculate. + + RESULT is the return register for the operation. Its value is NULL + if unused. + + MEM is the location of the atomic access. + + OLDVAL is the first operand for the operation. + + NEWVAL is the optional second operand for the operation. Its value + is NULL if unused. */ + +void +loongarch_expand_atomic_qihi (union loongarch_gen_fn_ptrs generator, + rtx result, rtx mem, rtx oldval, rtx newval, + rtx model) +{ + rtx orig_addr, memsi_addr, memsi, shift, shiftsi, unshifted_mask; + rtx unshifted_mask_reg, mask, inverted_mask, si_op; + rtx res = NULL; + machine_mode mode; + + mode = GET_MODE (mem); + + /* Compute the address of the containing SImode value. */ + orig_addr = force_reg (Pmode, XEXP (mem, 0)); + memsi_addr = loongarch_force_binary (Pmode, AND, orig_addr, + force_reg (Pmode, GEN_INT (-4))); + + /* Create a memory reference for it. */ + memsi = gen_rtx_MEM (SImode, memsi_addr); + set_mem_alias_set (memsi, ALIAS_SET_MEMORY_BARRIER); + MEM_VOLATILE_P (memsi) = MEM_VOLATILE_P (mem); + + /* Work out the byte offset of the QImode or HImode value, + counting from the least significant byte. */ + shift = loongarch_force_binary (Pmode, AND, orig_addr, GEN_INT (3)); + /* Multiply by eight to convert the shift value from bytes to bits. */ + loongarch_emit_binary (ASHIFT, shift, shift, GEN_INT (3)); + + /* Make the final shift an SImode value, so that it can be used in + SImode operations. 
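+     SHIFT was computed in Pmode, which may be wider than SImode on 64-bit
+     targets, hence the lowpart conversion below.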
*/ + shiftsi = force_reg (SImode, gen_lowpart (SImode, shift)); + + /* Set MASK to an inclusive mask of the QImode or HImode value. */ + unshifted_mask = GEN_INT (GET_MODE_MASK (mode)); + unshifted_mask_reg = force_reg (SImode, unshifted_mask); + mask = loongarch_force_binary (SImode, ASHIFT, unshifted_mask_reg, shiftsi); + + /* Compute the equivalent exclusive mask. */ + inverted_mask = gen_reg_rtx (SImode); + emit_insn (gen_rtx_SET (inverted_mask, gen_rtx_NOT (SImode, mask))); + + /* Shift the old value into place. */ + if (oldval != const0_rtx) + { + oldval = convert_modes (SImode, mode, oldval, true); + oldval = force_reg (SImode, oldval); + oldval = loongarch_force_binary (SImode, ASHIFT, oldval, shiftsi); + } + + /* Do the same for the new value. */ + if (newval && newval != const0_rtx) + { + newval = convert_modes (SImode, mode, newval, true); + newval = force_reg (SImode, newval); + newval = loongarch_force_binary (SImode, ASHIFT, newval, shiftsi); + } + + /* Do the SImode atomic access. */ + if (result) + res = gen_reg_rtx (SImode); + + if (newval) + si_op = generator.fn_7 (res, memsi, mask, inverted_mask, oldval, newval, + model); + else if (result) + si_op = generator.fn_6 (res, memsi, mask, inverted_mask, oldval, model); + else + si_op = generator.fn_5 (memsi, mask, inverted_mask, oldval, model); + + emit_insn (si_op); + + if (result) + { + /* Shift and convert the result. */ + loongarch_emit_binary (AND, res, res, mask); + loongarch_emit_binary (LSHIFTRT, res, res, shiftsi); + loongarch_emit_move (result, gen_lowpart (GET_MODE (result), res)); + } +} + +/* Return true if (zero_extract OP WIDTH BITPOS) can be used as the + source of an "ext" instruction or the destination of an "ins" + instruction. OP must be a register operand and the following + conditions must hold: + + 0 <= BITPOS < GET_MODE_BITSIZE (GET_MODE (op)) + 0 < WIDTH <= GET_MODE_BITSIZE (GET_MODE (op)) + 0 < BITPOS + WIDTH <= GET_MODE_BITSIZE (GET_MODE (op)) + + Also reject lengths equal to a word as they are better handled + by the move patterns. */ + +bool +loongarch_use_ins_ext_p (rtx op, HOST_WIDE_INT width, HOST_WIDE_INT bitpos) +{ + if (!register_operand (op, VOIDmode) + || GET_MODE_BITSIZE (GET_MODE (op)) > BITS_PER_WORD) + return false; + + if (!IN_RANGE (width, 1, GET_MODE_BITSIZE (GET_MODE (op)) - 1)) + return false; + + if (bitpos < 0 || bitpos + width > GET_MODE_BITSIZE (GET_MODE (op))) + return false; + + return true; +} + +/* Print the text for PRINT_OPERAND punctation character CH to FILE. + The punctuation characters are: + + '.' Print the name of the register with a hard-wired zero (zero or $r0). + '$' Print the name of the stack pointer register (sp or $r3). + + See also loongarch_init_print_operand_punct. */ + +static void +loongarch_print_operand_punctuation (FILE *file, int ch) +{ + switch (ch) + { + case '.': + fputs (reg_names[GP_REG_FIRST + 0], file); + break; + + case '$': + fputs (reg_names[STACK_POINTER_REGNUM], file); + break; + + default: + gcc_unreachable (); + break; + } +} + +/* Initialize loongarch_print_operand_punct. */ + +static void +loongarch_init_print_operand_punct (void) +{ + const char *p; + + for (p = ".$"; *p; p++) + loongarch_print_operand_punct[(unsigned char) *p] = true; +} + +/* PRINT_OPERAND prefix LETTER refers to the integer branch instruction + associated with condition CODE. Print the condition part of the + opcode to FILE. 
*/ + +static void +loongarch_print_int_branch_condition (FILE *file, enum rtx_code code, + int letter) +{ + switch (code) + { + case EQ: + case NE: + case GT: + case GE: + case LT: + case LE: + case GTU: + case GEU: + case LTU: + case LEU: + /* Conveniently, the LoongArch names for these conditions are the same + as their RTL equivalents. */ + fputs (GET_RTX_NAME (code), file); + break; + + default: + output_operand_lossage ("'%%%c' is not a valid operand prefix", letter); + break; + } +} + +/* Likewise floating-point branches. */ + +static void +loongarch_print_float_branch_condition (FILE *file, enum rtx_code code, + int letter) +{ + switch (code) + { + case EQ: + fputs ("ceqz", file); + break; + + case NE: + fputs ("cnez", file); + break; + + default: + output_operand_lossage ("'%%%c' is not a valid operand prefix", letter); + break; + } +} + +/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */ + +static bool +loongarch_print_operand_punct_valid_p (unsigned char code) +{ + return loongarch_print_operand_punct[code]; +} + +/* Return true if a FENCE should be emitted to before a memory access to + implement the release portion of memory model MODEL. */ + +static bool +loongarch_memmodel_needs_rel_acq_fence (enum memmodel model) +{ + switch (model) + { + case MEMMODEL_ACQ_REL: + case MEMMODEL_SEQ_CST: + case MEMMODEL_SYNC_SEQ_CST: + case MEMMODEL_RELEASE: + case MEMMODEL_SYNC_RELEASE: + case MEMMODEL_ACQUIRE: + case MEMMODEL_CONSUME: + case MEMMODEL_SYNC_ACQUIRE: + return true; + + case MEMMODEL_RELAXED: + return false; + + default: + gcc_unreachable (); + } +} + +/* Return true if a FENCE should be emitted to before a memory access to + implement the release portion of memory model MODEL. */ + +static bool +loongarch_memmodel_needs_release_fence (enum memmodel model) +{ + switch (model) + { + case MEMMODEL_ACQ_REL: + case MEMMODEL_SEQ_CST: + case MEMMODEL_SYNC_SEQ_CST: + case MEMMODEL_RELEASE: + case MEMMODEL_SYNC_RELEASE: + return true; + + case MEMMODEL_ACQUIRE: + case MEMMODEL_CONSUME: + case MEMMODEL_SYNC_ACQUIRE: + case MEMMODEL_RELAXED: + return false; + + default: + gcc_unreachable (); + } +} + +/* Implement TARGET_PRINT_OPERAND. The LoongArch-specific operand codes are: + + 'X' Print CONST_INT OP in hexadecimal format. + 'x' Print the low 16 bits of CONST_INT OP in hexadecimal format. + 'd' Print CONST_INT OP in decimal. + 'm' Print one less than CONST_INT OP in decimal. + 'y' Print exact log2 of CONST_INT OP in decimal. + 'C' Print the integer branch condition for comparison OP. + 'N' Print the inverse of the integer branch condition for comparison OP. + 'F' Print the FPU branch condition for comparison OP. + 'W' Print the inverse of the FPU branch condition for comparison OP. + 'T' Print 'f' for (eq:CC ...), 't' for (ne:CC ...), + 'z' for (eq:?I ...), 'n' for (ne:?I ...). + 't' Like 'T', but with the EQ/NE cases reversed + 'Y' Print loongarch_fp_conditions[INTVAL (OP)] + 'Z' Print OP and a comma for 8CC, otherwise print nothing. + 'z' Print $0 if OP is zero, otherwise print OP normally. + 'b' Print the address of a memory operand, without offset. + 'V' Print exact log2 of CONST_INT OP element 0 of a replicated + CONST_VECTOR in decimal. + 'A' Print a _DB suffix if the memory model requires a release. + 'G' Print a DBAR insn if the memory model requires a release. + 'i' Print i if the operand is not a register. 
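+    For example, the move templates elsewhere in this file are written as
+    "st.w\t%z1,%0": '%z1' prints $r0 when operand 1 is the constant zero and
+    the register name otherwise, while '%0' prints the memory operand's
+    address in "base,offset" form.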
*/ + +static void +loongarch_print_operand (FILE *file, rtx op, int letter) +{ + enum rtx_code code; + + if (loongarch_print_operand_punct_valid_p (letter)) + { + loongarch_print_operand_punctuation (file, letter); + return; + } + + gcc_assert (op); + code = GET_CODE (op); + + switch (letter) + { + case 'X': + if (CONST_INT_P (op)) + fprintf (file, HOST_WIDE_INT_PRINT_HEX, INTVAL (op)); + else + output_operand_lossage ("invalid use of '%%%c'", letter); + break; + + case 'x': + if (CONST_INT_P (op)) + fprintf (file, HOST_WIDE_INT_PRINT_HEX, INTVAL (op) & 0xffff); + else + output_operand_lossage ("invalid use of '%%%c'", letter); + break; + + case 'd': + if (CONST_INT_P (op)) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (op)); + else + output_operand_lossage ("invalid use of '%%%c'", letter); + break; + + case 'm': + if (CONST_INT_P (op)) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (op) - 1); + else + output_operand_lossage ("invalid use of '%%%c'", letter); + break; + + case 'y': + if (CONST_INT_P (op)) + { + int val = exact_log2 (INTVAL (op)); + if (val != -1) + fprintf (file, "%d", val); + else + output_operand_lossage ("invalid use of '%%%c'", letter); + } + else + output_operand_lossage ("invalid use of '%%%c'", letter); + break; + + case 'V': + if (CONST_VECTOR_P (op)) + { + machine_mode mode = GET_MODE_INNER (GET_MODE (op)); + unsigned HOST_WIDE_INT val = UINTVAL (CONST_VECTOR_ELT (op, 0)); + int vlog2 = exact_log2 (val & GET_MODE_MASK (mode)); + if (vlog2 != -1) + fprintf (file, "%d", vlog2); + else + output_operand_lossage ("invalid use of '%%%c'", letter); + } + else + output_operand_lossage ("invalid use of '%%%c'", letter); + break; + + case 'C': + loongarch_print_int_branch_condition (file, code, letter); + break; + + case 'N': + loongarch_print_int_branch_condition (file, reverse_condition (code), + letter); + break; + + case 'F': + loongarch_print_float_branch_condition (file, code, letter); + break; + + case 'W': + loongarch_print_float_branch_condition (file, reverse_condition (code), + letter); + break; + + case 'T': + case 't': + { + int truth = (code == NE) == (letter == 'T'); + fputc ("zfnt"[truth * 2 + FCC_REG_P (REGNO (XEXP (op, 0)))], file); + } + break; + + case 'Y': + if (code == CONST_INT + && UINTVAL (op) < ARRAY_SIZE (loongarch_fp_conditions)) + fputs (loongarch_fp_conditions[UINTVAL (op)], file); + else + output_operand_lossage ("'%%%c' is not a valid operand prefix", + letter); + break; + + case 'Z': + loongarch_print_operand (file, op, 0); + fputc (',', file); + break; + + case 'A': + if (loongarch_memmodel_needs_rel_acq_fence ((enum memmodel) INTVAL (op))) + fputs ("_db", file); + break; + + case 'G': + if (loongarch_memmodel_needs_release_fence ((enum memmodel) INTVAL (op))) + fputs ("dbar\t0", file); + break; + + case 'i': + if (code != REG) + fputs ("i", file); + break; + + default: + switch (code) + { + case REG: + { + unsigned int regno = REGNO (op); + if (letter && letter != 'z') + output_operand_lossage ("invalid use of '%%%c'", letter); + fprintf (file, "%s", reg_names[regno]); + } + break; + + case MEM: + if (letter == 'D') + output_address (GET_MODE (op), + plus_constant (Pmode, XEXP (op, 0), 4)); + else if (letter == 'b') + { + gcc_assert (REG_P (XEXP (op, 0))); + loongarch_print_operand (file, XEXP (op, 0), 0); + } + else if (letter && letter != 'z') + output_operand_lossage ("invalid use of '%%%c'", letter); + else + output_address (GET_MODE (op), XEXP (op, 0)); + break; + + default: + if (letter == 'z' && op == CONST0_RTX (GET_MODE 
(op))) + fputs (reg_names[GP_REG_FIRST], file); + else if (letter && letter != 'z') + output_operand_lossage ("invalid use of '%%%c'", letter); + else + output_addr_const (file, loongarch_strip_unspec_address (op)); + break; + } + } +} + +/* Implement TARGET_PRINT_OPERAND_ADDRESS. */ + +static void +loongarch_print_operand_address (FILE *file, machine_mode /* mode */, rtx x) +{ + struct loongarch_address_info addr; + + if (loongarch_classify_address (&addr, x, word_mode, true)) + switch (addr.type) + { + case ADDRESS_REG: + fprintf (file, "%s,", reg_names[REGNO (addr.reg)]); + loongarch_print_operand (file, addr.offset, 0); + return; + + case ADDRESS_REG_REG: + fprintf (file, "%s,%s", reg_names[REGNO (addr.reg)], + reg_names[REGNO (addr.offset)]); + return; + + case ADDRESS_CONST_INT: + fprintf (file, "%s,", reg_names[GP_REG_FIRST]); + output_addr_const (file, x); + return; + + case ADDRESS_SYMBOLIC: + output_addr_const (file, loongarch_strip_unspec_address (x)); + return; + } + if (CONST_INT_P (x)) + output_addr_const (file, x); + else + gcc_unreachable (); +} + +/* Implement TARGET_ASM_SELECT_RTX_SECTION. */ + +static section * +loongarch_select_rtx_section (machine_mode mode, rtx x, + unsigned HOST_WIDE_INT align) +{ + /* ??? Consider using mergeable small data sections. */ + if (loongarch_rtx_constant_in_small_data_p (mode)) + return get_named_section (NULL, ".sdata", 0); + + return default_elf_select_rtx_section (mode, x, align); +} + +/* Implement TARGET_ASM_FUNCTION_RODATA_SECTION. + + The complication here is that jump tables will use absolute addresses, + and should therefore not be included in the read-only part of a DSO. + Handle such cases by selecting a normal data section instead of a + read-only one. The logic apes that in default_function_rodata_section. */ + +static section * +loongarch_function_rodata_section (tree decl, bool) +{ + return default_function_rodata_section (decl, false); +} + +/* Implement TARGET_IN_SMALL_DATA_P. */ + +static bool +loongarch_in_small_data_p (const_tree decl) +{ + int size; + + if (TREE_CODE (decl) == STRING_CST || TREE_CODE (decl) == FUNCTION_DECL) + return false; + + if (VAR_P (decl) && DECL_SECTION_NAME (decl) != 0) + { + const char *name; + + /* Reject anything that isn't in a known small-data section. */ + name = DECL_SECTION_NAME (decl); + if (strcmp (name, ".sdata") != 0 && strcmp (name, ".sbss") != 0) + return false; + + /* If a symbol is defined externally, the assembler will use the + usual -G rules when deciding how to implement macros. */ + if (!DECL_EXTERNAL (decl)) + return true; + } + + /* We have traditionally not treated zero-sized objects as small data, + so this is now effectively part of the ABI. */ + size = int_size_in_bytes (TREE_TYPE (decl)); + return size > 0 && size <= g_switch_value; +} + +/* The LoongArch debug format wants all automatic variables and arguments + to be in terms of the virtual frame pointer (stack pointer before + any adjustment in the function), while the LoongArch linker wants + the frame pointer to be the stack pointer after the initial + adjustment. So, we do the adjustment here. The arg pointer (which + is eliminated) points to the virtual frame pointer, while the frame + pointer (which may be eliminated) points to the stack pointer after + the initial adjustments. 
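+    Concretely, offsets against the stack or frame pointer are reduced by the
+    frame's total size, and offsets against the hard frame pointer are then
+    corrected by adding the frame's hard_frame_pointer_offset back in.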
*/ + +HOST_WIDE_INT +loongarch_debugger_offset (rtx addr, HOST_WIDE_INT offset) +{ + rtx offset2 = const0_rtx; + rtx reg = eliminate_constant_term (addr, &offset2); + + if (offset == 0) + offset = INTVAL (offset2); + + if (reg == stack_pointer_rtx + || reg == frame_pointer_rtx + || reg == hard_frame_pointer_rtx) + { + offset -= cfun->machine->frame.total_size; + if (reg == hard_frame_pointer_rtx) + offset += cfun->machine->frame.hard_frame_pointer_offset; + } + + return offset; +} + +/* Implement ASM_OUTPUT_EXTERNAL. */ + +void +loongarch_output_external (FILE *file, tree decl, const char *name) +{ + default_elf_asm_output_external (file, decl, name); + + /* We output the name if and only if TREE_SYMBOL_REFERENCED is + set in order to avoid putting out names that are never really + used. */ + if (TREE_SYMBOL_REFERENCED (DECL_ASSEMBLER_NAME (decl))) + { + if (loongarch_in_small_data_p (decl)) + { + /* When using assembler macros, emit .extern directives for + all small-data externs so that the assembler knows how + big they are. + + In most cases it would be safe (though pointless) to emit + .externs for other symbols too. One exception is when an + object is within the -G limit but declared by the user to + be in a section other than .sbss or .sdata. */ + fputs ("\t.extern\t", file); + assemble_name (file, name); + fprintf (file, ", " HOST_WIDE_INT_PRINT_DEC "\n", + int_size_in_bytes (TREE_TYPE (decl))); + } + } +} + +/* Implement TARGET_ASM_OUTPUT_DWARF_DTPREL. */ + +static void ATTRIBUTE_UNUSED +loongarch_output_dwarf_dtprel (FILE *file, int size, rtx x) +{ + switch (size) + { + case 4: + fputs ("\t.dtprelword\t", file); + break; + + case 8: + fputs ("\t.dtpreldword\t", file); + break; + + default: + gcc_unreachable (); + } + output_addr_const (file, x); + fputs ("+0x8000", file); +} + +/* Implement ASM_OUTPUT_ASCII. */ + +void +loongarch_output_ascii (FILE *stream, const char *string, size_t len) +{ + size_t i; + int cur_pos; + + cur_pos = 17; + fprintf (stream, "\t.ascii\t\""); + for (i = 0; i < len; i++) + { + int c; + + c = (unsigned char) string[i]; + if (ISPRINT (c)) + { + if (c == '\\' || c == '\"') + { + putc ('\\', stream); + cur_pos++; + } + putc (c, stream); + cur_pos++; + } + else + { + fprintf (stream, "\\%03o", c); + cur_pos += 4; + } + + if (cur_pos > 72 && i + 1 < len) + { + cur_pos = 17; + fprintf (stream, "\"\n\t.ascii\t\""); + } + } + fprintf (stream, "\"\n"); +} + +/* Implement TARGET_FRAME_POINTER_REQUIRED. */ + +static bool +loongarch_frame_pointer_required (void) +{ + /* If the function contains dynamic stack allocations, we need to + use the frame pointer to access the static parts of the frame. */ + if (cfun->calls_alloca) + return true; + + return false; +} + +/* Implement TARGET_CAN_ELIMINATE. Make sure that we're not trying + to eliminate to the wrong hard frame pointer. */ + +static bool +loongarch_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) +{ + return (to == HARD_FRAME_POINTER_REGNUM || to == STACK_POINTER_REGNUM); +} + +/* Implement RETURN_ADDR_RTX. We do not support moving back to a + previous frame. */ + +rtx +loongarch_return_addr (int count, rtx frame ATTRIBUTE_UNUSED) +{ + if (count != 0) + return const0_rtx; + + return get_hard_reg_initial_val (Pmode, RETURN_ADDR_REGNUM); +} + +/* Emit code to change the current function's return address to + ADDRESS. SCRATCH is available as a scratch register, if needed. + ADDRESS and SCRATCH are both word-mode GPRs. 
*/ + +void +loongarch_set_return_address (rtx address, rtx scratch) +{ + rtx slot_address; + + gcc_assert (BITSET_P (cfun->machine->frame.mask, RETURN_ADDR_REGNUM)); + + if (frame_pointer_needed) + slot_address = loongarch_add_offset (scratch, hard_frame_pointer_rtx, + -UNITS_PER_WORD); + else + slot_address = loongarch_add_offset (scratch, stack_pointer_rtx, + cfun->machine->frame.gp_sp_offset); + + loongarch_emit_move (gen_frame_mem (GET_MODE (address), slot_address), + address); +} + +/* Return true if register REGNO can store a value of mode MODE. + The result of this function is cached in loongarch_hard_regno_mode_ok. */ + +static bool +loongarch_hard_regno_mode_ok_uncached (unsigned int regno, machine_mode mode) +{ + unsigned int size; + enum mode_class mclass; + + if (mode == FCCmode) + return FCC_REG_P (regno); + + size = GET_MODE_SIZE (mode); + mclass = GET_MODE_CLASS (mode); + + if (GP_REG_P (regno)) + return ((regno - GP_REG_FIRST) & 1) == 0 || size <= UNITS_PER_WORD; + + if (FP_REG_P (regno)) + { + if (mclass == MODE_FLOAT + || mclass == MODE_COMPLEX_FLOAT + || mclass == MODE_VECTOR_FLOAT) + return size <= UNITS_PER_FPVALUE; + + /* Allow integer modes that fit into a single register. We need + to put integers into FPRs when using instructions like CVT + and TRUNC. There's no point allowing sizes smaller than a word, + because the FPU has no appropriate load/store instructions. */ + if (mclass == MODE_INT) + return size >= MIN_UNITS_PER_WORD && size <= UNITS_PER_FPREG; + } + + return false; +} + +/* Implement TARGET_HARD_REGNO_MODE_OK. */ + +static bool +loongarch_hard_regno_mode_ok (unsigned int regno, machine_mode mode) +{ + return loongarch_hard_regno_mode_ok_p[mode][regno]; +} + +/* Implement TARGET_HARD_REGNO_NREGS. */ + +static unsigned int +loongarch_hard_regno_nregs (unsigned int regno, machine_mode mode) +{ + if (FCC_REG_P (regno)) + /* The size of FP status registers is always 4, because they only hold + FCCmode values, and FCCmode is always considered to be 4 bytes wide. */ + return (GET_MODE_SIZE (mode) + 3) / 4; + + if (FP_REG_P (regno)) + return (GET_MODE_SIZE (mode) + UNITS_PER_FPREG - 1) / UNITS_PER_FPREG; + + /* All other registers are word-sized. */ + return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD; +} + +/* Implement CLASS_MAX_NREGS, taking the maximum of the cases + in loongarch_hard_regno_nregs. */ + +int +loongarch_class_max_nregs (enum reg_class rclass, machine_mode mode) +{ + int size; + HARD_REG_SET left; + + size = 0x8000; + left = reg_class_contents[rclass]; + if (hard_reg_set_intersect_p (left, reg_class_contents[(int) FCC_REGS])) + { + if (loongarch_hard_regno_mode_ok (FCC_REG_FIRST, mode)) + size = MIN (size, 4); + + left &= ~reg_class_contents[FCC_REGS]; + } + if (hard_reg_set_intersect_p (left, reg_class_contents[(int) FP_REGS])) + { + if (loongarch_hard_regno_mode_ok (FP_REG_FIRST, mode)) + size = MIN (size, UNITS_PER_FPREG); + + left &= ~reg_class_contents[FP_REGS]; + } + if (!hard_reg_set_empty_p (left)) + size = MIN (size, UNITS_PER_WORD); + return (GET_MODE_SIZE (mode) + size - 1) / size; +} + +/* Implement TARGET_CAN_CHANGE_MODE_CLASS. 
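+    Mode changes are rejected for any class that overlaps FP_REGS; values in
+    other classes can simply be reinterpreted in the new mode.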
*/ + +static bool +loongarch_can_change_mode_class (machine_mode, machine_mode, + reg_class_t rclass) +{ + return !reg_classes_intersect_p (FP_REGS, rclass); +} + +/* Return true if moves in mode MODE can use the FPU's fmov.fmt instruction, +*/ + +static bool +loongarch_mode_ok_for_mov_fmt_p (machine_mode mode) +{ + switch (mode) + { + case E_FCCmode: + case E_SFmode: + return TARGET_HARD_FLOAT; + + case E_DFmode: + return TARGET_HARD_FLOAT && TARGET_DOUBLE_FLOAT; + + default: + return 0; + } +} + +/* Implement TARGET_MODES_TIEABLE_P. */ + +static bool +loongarch_modes_tieable_p (machine_mode mode1, machine_mode mode2) +{ + /* FPRs allow no mode punning, so it's not worth tying modes if we'd + prefer to put one of them in FPRs. */ + return (mode1 == mode2 + || (!loongarch_mode_ok_for_mov_fmt_p (mode1) + && !loongarch_mode_ok_for_mov_fmt_p (mode2))); +} + +/* Implement TARGET_PREFERRED_RELOAD_CLASS. */ + +static reg_class_t +loongarch_preferred_reload_class (rtx x, reg_class_t rclass) +{ + if (reg_class_subset_p (FP_REGS, rclass) + && loongarch_mode_ok_for_mov_fmt_p (GET_MODE (x))) + return FP_REGS; + + if (reg_class_subset_p (GR_REGS, rclass)) + rclass = GR_REGS; + + return rclass; +} + +/* RCLASS is a class involved in a REGISTER_MOVE_COST calculation. + Return a "canonical" class to represent it in later calculations. */ + +static reg_class_t +loongarch_canonicalize_move_class (reg_class_t rclass) +{ + if (reg_class_subset_p (rclass, GENERAL_REGS)) + rclass = GENERAL_REGS; + + return rclass; +} + +/* Return the cost of moving a value from a register of class FROM to a GPR. + Return 0 for classes that are unions of other classes handled by this + function. */ + +static int +loongarch_move_to_gpr_cost (reg_class_t from) +{ + switch (from) + { + case GENERAL_REGS: + /* MOVE macro. */ + return 2; + + case FP_REGS: + /* MOVFR2GR, etc. */ + return 4; + + default: + return 0; + } +} + +/* Return the cost of moving a value from a GPR to a register of class TO. + Return 0 for classes that are unions of other classes handled by this + function. */ + +static int +loongarch_move_from_gpr_cost (reg_class_t to) +{ + switch (to) + { + case GENERAL_REGS: + /*MOVE macro. */ + return 2; + + case FP_REGS: + /* MOVGR2FR, etc. */ + return 4; + + default: + return 0; + } +} + +/* Implement TARGET_REGISTER_MOVE_COST. Return 0 for classes that are the + maximum of the move costs for subclasses; regclass will work out + the maximum for us. */ + +static int +loongarch_register_move_cost (machine_mode mode, reg_class_t from, + reg_class_t to) +{ + reg_class_t dregs; + int cost1, cost2; + + from = loongarch_canonicalize_move_class (from); + to = loongarch_canonicalize_move_class (to); + + /* Handle moves that can be done without using general-purpose registers. */ + if (from == FP_REGS) + { + if (to == FP_REGS && loongarch_mode_ok_for_mov_fmt_p (mode)) + /* FMOV.FMT. */ + return 4; + } + + /* Handle cases in which only one class deviates from the ideal. */ + dregs = GENERAL_REGS; + if (from == dregs) + return loongarch_move_from_gpr_cost (to); + if (to == dregs) + return loongarch_move_to_gpr_cost (from); + + /* Handles cases that require a GPR temporary. */ + cost1 = loongarch_move_to_gpr_cost (from); + if (cost1 != 0) + { + cost2 = loongarch_move_from_gpr_cost (to); + if (cost2 != 0) + return cost1 + cost2; + } + + return 0; +} + +/* Implement TARGET_MEMORY_MOVE_COST. 
*/ + +static int +loongarch_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in) +{ + return (loongarch_cost->memory_latency + + memory_move_secondary_cost (mode, rclass, in)); +} + +/* Return the register class required for a secondary register when + copying between one of the registers in RCLASS and value X, which + has mode MODE. X is the source of the move if IN_P, otherwise it + is the destination. Return NO_REGS if no secondary register is + needed. */ + +static reg_class_t +loongarch_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, + reg_class_t rclass, machine_mode mode, + secondary_reload_info *sri ATTRIBUTE_UNUSED) +{ + int regno; + + regno = true_regnum (x); + + if (reg_class_subset_p (rclass, FP_REGS)) + { + if (regno < 0 + || (MEM_P (x) + && (GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8))) + /* In this case we can use fld.s, fst.s, fld.d or fst.d. */ + return NO_REGS; + + if (GP_REG_P (regno) || x == CONST0_RTX (mode)) + /* In this case we can use movgr2fr.s, movfr2gr.s, movgr2fr.d or + * movfr2gr.d. */ + return NO_REGS; + + if (CONSTANT_P (x) && !targetm.cannot_force_const_mem (mode, x)) + /* We can force the constant to memory and use fld.s + and fld.d. As above, we will use pairs of lwc1s if + ldc1 is not supported. */ + return NO_REGS; + + if (FP_REG_P (regno) && loongarch_mode_ok_for_mov_fmt_p (mode)) + /* In this case we can use fmov.{s/d}. */ + return NO_REGS; + + /* Otherwise, we need to reload through an integer register. */ + return GR_REGS; + } + if (FP_REG_P (regno)) + return reg_class_subset_p (rclass, GR_REGS) ? NO_REGS : GR_REGS; + + return NO_REGS; +} + +/* Implement TARGET_VALID_POINTER_MODE. */ + +static bool +loongarch_valid_pointer_mode (scalar_int_mode mode) +{ + return mode == SImode || (TARGET_64BIT && mode == DImode); +} + +/* Implement TARGET_SCALAR_MODE_SUPPORTED_P. */ + +static bool +loongarch_scalar_mode_supported_p (scalar_mode mode) +{ + if (ALL_FIXED_POINT_MODE_P (mode) + && GET_MODE_PRECISION (mode) <= 2 * BITS_PER_WORD) + return true; + + return default_scalar_mode_supported_p (mode); +} + +/* Return the assembly code for INSN, which has the operands given by + OPERANDS, and which branches to OPERANDS[0] if some condition is true. + BRANCH_IF_TRUE is the asm template that should be used if OPERANDS[0] + is in range of a direct branch. BRANCH_IF_FALSE is an inverted + version of BRANCH_IF_TRUE. */ + +const char * +loongarch_output_conditional_branch (rtx_insn *insn, rtx *operands, + const char *branch_if_true, + const char *branch_if_false) +{ + unsigned int length; + rtx taken; + + gcc_assert (LABEL_P (operands[0])); + + length = get_attr_length (insn); + if (length <= 4) + { + return branch_if_true; + } + + /* Generate a reversed branch around a direct jump. */ + rtx_code_label *not_taken = gen_label_rtx (); + taken = operands[0]; + + /* Generate the reversed branch to NOT_TAKEN. */ + operands[0] = not_taken; + output_asm_insn (branch_if_false, operands); + + output_asm_insn ("b\t%0", &taken); + + /* Output NOT_TAKEN. */ + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (not_taken)); + return ""; +} + +/* Return the assembly code for INSN, which branches to OPERANDS[0] + if some equality condition is true. The condition is given by + OPERANDS[1] if !INVERTED_P, otherwise it is the inverse of + OPERANDS[1]. OPERANDS[2] is the comparison's first operand; + OPERANDS[3] is the second operand and may be zero or a register. 
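+    For example, an EQ test against zero is emitted as "beqz\t%2,%0", a NE
+    test against a register as "bne\t%2,%z3,%0", and the inverted forms swap
+    to "bnez"/"beq".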
*/ + +const char * +loongarch_output_equal_conditional_branch (rtx_insn *insn, rtx *operands, + bool inverted_p) +{ + const char *branch[2]; + if (operands[3] == const0_rtx) + { + branch[!inverted_p] = LARCH_BRANCH ("b%C1z", "%2,%0"); + branch[inverted_p] = LARCH_BRANCH ("b%N1z", "%2,%0"); + } + else + { + branch[!inverted_p] = LARCH_BRANCH ("b%C1", "%2,%z3,%0"); + branch[inverted_p] = LARCH_BRANCH ("b%N1", "%2,%z3,%0"); + } + + return loongarch_output_conditional_branch (insn, operands, branch[1], + branch[0]); +} + +/* Return the assembly code for INSN, which branches to OPERANDS[0] + if some ordering condition is true. The condition is given by + OPERANDS[1] if !INVERTED_P, otherwise it is the inverse of + OPERANDS[1]. OPERANDS[2] is the comparison's first operand; + OPERANDS[3] is the second operand and may be zero or a register. */ + +const char * +loongarch_output_order_conditional_branch (rtx_insn *insn, rtx *operands, + bool inverted_p) +{ + const char *branch[2]; + + /* Make BRANCH[1] branch to OPERANDS[0] when the condition is true. + Make BRANCH[0] branch on the inverse condition. */ + if (operands[3] != const0_rtx) + { + /* Handle degenerate cases that should not, but do, occur. */ + if (REGNO (operands[2]) == REGNO (operands[3])) + { + switch (GET_CODE (operands[1])) + { + case LT: + case LTU: + case GT: + case GTU: + inverted_p = !inverted_p; + /* Fall through. */ + case LE: + case LEU: + case GE: + case GEU: + branch[!inverted_p] = LARCH_BRANCH ("b", "%0"); + branch[inverted_p] = "\t# branch never"; + break; + default: + gcc_unreachable (); + } + } + else + { + switch (GET_CODE (operands[1])) + { + case LE: + case LEU: + case GT: + case GTU: + case LT: + case LTU: + case GE: + case GEU: + branch[!inverted_p] = LARCH_BRANCH ("b%C1", "%2,%3,%0"); + branch[inverted_p] = LARCH_BRANCH ("b%N1", "%2,%3,%0"); + break; + default: + gcc_unreachable (); + } + } + } + else + { + switch (GET_CODE (operands[1])) + { + /* These cases are equivalent to comparisons against zero. */ + case LEU: + case GTU: + case LTU: + case GEU: + case LE: + case GT: + case LT: + case GE: + branch[!inverted_p] = LARCH_BRANCH ("b%C1", "%2,$r0,%0"); + branch[inverted_p] = LARCH_BRANCH ("b%N1", "%2,$r0,%0"); + break; + default: + gcc_unreachable (); + } + } + return loongarch_output_conditional_branch (insn, operands, branch[1], + branch[0]); +} + +/* Return the assembly code for DIV.{W/D} instruction DIVISION, which has + the operands given by OPERANDS. Add in a divide-by-zero check if needed. + */ + +const char * +loongarch_output_division (const char *division, rtx *operands) +{ + const char *s; + + s = division; + if (TARGET_CHECK_ZERO_DIV) + { + output_asm_insn (s, operands); + s = "bne\t%2,%.,1f\n\tbreak\t7\n1:"; + } + return s; +} + +/* Implement TARGET_SCHED_ADJUST_COST. We assume that anti and output + dependencies have no cost. */ + +static int +loongarch_adjust_cost (rtx_insn *, int dep_type, rtx_insn *, int cost, + unsigned int) +{ + if (dep_type != 0 && (dep_type != REG_DEP_OUTPUT)) + return 0; + return cost; +} + +/* Return the number of instructions that can be issued per cycle. */ + +static int +loongarch_issue_rate (void) +{ + if ((unsigned long) LARCH_ACTUAL_TUNE < N_TUNE_TYPES) + return loongarch_cpu_issue_rate[LARCH_ACTUAL_TUNE]; + else + return 1; +} + +/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD. This should + be as wide as the scheduling freedom in the DFA. 
*/ + +static int +loongarch_multipass_dfa_lookahead (void) +{ + if ((unsigned long) LARCH_ACTUAL_TUNE < N_ARCH_TYPES) + return loongarch_cpu_multipass_dfa_lookahead[LARCH_ACTUAL_TUNE]; + else + return 0; +} + +/* Implement TARGET_SCHED_REORDER. */ + +static int +loongarch_sched_reorder (FILE *file ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, + rtx_insn **ready ATTRIBUTE_UNUSED, + int *nreadyp ATTRIBUTE_UNUSED, + int cycle ATTRIBUTE_UNUSED) +{ + return loongarch_issue_rate (); +} + +/* Implement TARGET_SCHED_REORDER2. */ + +static int +loongarch_sched_reorder2 (FILE *file ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, + rtx_insn **ready ATTRIBUTE_UNUSED, + int *nreadyp ATTRIBUTE_UNUSED, + int cycle ATTRIBUTE_UNUSED) +{ + return cached_can_issue_more; +} + +/* Implement TARGET_SCHED_INIT. */ + +static void +loongarch_sched_init (FILE *file ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, + int max_ready ATTRIBUTE_UNUSED) +{} + +/* Implement TARGET_SCHED_VARIABLE_ISSUE. */ + +static int +loongarch_variable_issue (FILE *file ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, rtx_insn *insn, + int more) +{ + /* Ignore USEs and CLOBBERs; don't count them against the issue rate. */ + if (USEFUL_INSN_P (insn)) + { + if (get_attr_type (insn) != TYPE_GHOST) + more--; + } + + /* Instructions of type 'multi' should all be split before + the second scheduling pass. */ + gcc_assert (!reload_completed + || recog_memoized (insn) < 0 + || get_attr_type (insn) != TYPE_MULTI); + + cached_can_issue_more = more; + return more; +} + +/* Given that we have an rtx of the form (prefetch ... WRITE LOCALITY), + return the first operand of the associated PREF or PREFX insn. */ + +rtx +loongarch_prefetch_cookie (rtx write, rtx locality) +{ + /* store_streamed / load_streamed. */ + if (INTVAL (locality) <= 0) + return GEN_INT (INTVAL (write) + 4); + + /* store / load. */ + if (INTVAL (locality) <= 2) + return write; + + /* store_retained / load_retained. */ + return GEN_INT (INTVAL (write) + 6); +} + +/* Implement TARGET_ASM_OUTPUT_MI_THUNK. Generate rtl rather than asm text + in order to avoid duplicating too much logic from elsewhere. */ + +static void +loongarch_output_mi_thunk (FILE *file, tree thunk_fndecl ATTRIBUTE_UNUSED, + HOST_WIDE_INT delta, HOST_WIDE_INT vcall_offset, + tree function) +{ + const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk_fndecl)); + rtx this_rtx, temp1, temp2, fnaddr; + rtx_insn *insn; + bool use_sibcall_p; + + /* Pretend to be a post-reload pass while generating rtl. */ + reload_completed = 1; + + /* Mark the end of the (empty) prologue. */ + emit_note (NOTE_INSN_PROLOGUE_END); + + /* Determine if we can use a sibcall to call FUNCTION directly. */ + fnaddr = XEXP (DECL_RTL (function), 0); + use_sibcall_p = const_call_insn_operand (fnaddr, Pmode); + + /* We need two temporary registers in some cases. */ + temp1 = gen_rtx_REG (Pmode, 12); + temp2 = gen_rtx_REG (Pmode, 13); + + /* Find out which register contains the "this" pointer. */ + if (aggregate_value_p (TREE_TYPE (TREE_TYPE (function)), function)) + this_rtx = gen_rtx_REG (Pmode, GP_ARG_FIRST + 1); + else + this_rtx = gen_rtx_REG (Pmode, GP_ARG_FIRST); + + /* Add DELTA to THIS_RTX. */ + if (delta != 0) + { + rtx offset = GEN_INT (delta); + if (!IMM12_OPERAND (delta)) + { + loongarch_emit_move (temp1, offset); + offset = temp1; + } + emit_insn (gen_add3_insn (this_rtx, this_rtx, offset)); + } + + /* If needed, add *(*THIS_RTX + VCALL_OFFSET) to THIS_RTX. 
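+     This is the usual virtual-call thunk adjustment: the extra offset is
+     loaded from the vtable that *THIS_RTX points to and added to the "this"
+     pointer before the tail call to FUNCTION.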
*/ + if (vcall_offset != 0) + { + rtx addr; + + /* Set TEMP1 to *THIS_RTX. */ + loongarch_emit_move (temp1, gen_rtx_MEM (Pmode, this_rtx)); + + /* Set ADDR to a legitimate address for *THIS_RTX + VCALL_OFFSET. */ + addr = loongarch_add_offset (temp2, temp1, vcall_offset); + + /* Load the offset and add it to THIS_RTX. */ + loongarch_emit_move (temp1, gen_rtx_MEM (Pmode, addr)); + emit_insn (gen_add3_insn (this_rtx, this_rtx, temp1)); + } + + /* Jump to the target function. Use a sibcall if direct jumps are + allowed, otherwise load the address into a register first. */ + if (use_sibcall_p) + { + insn = emit_call_insn (gen_sibcall_internal (fnaddr, const0_rtx)); + SIBLING_CALL_P (insn) = 1; + } + else + { + loongarch_emit_move (temp1, fnaddr); + emit_jump_insn (gen_indirect_jump (temp1)); + } + + /* Run just enough of rest_of_compilation. This sequence was + "borrowed" from alpha.c. */ + insn = get_insns (); + split_all_insns_noflow (); + shorten_branches (insn); + assemble_start_function (thunk_fndecl, fnname); + final_start_function (insn, file, 1); + final (insn, file, 1); + final_end_function (); + assemble_end_function (thunk_fndecl, fnname); + + /* Stop pretending to be a post-reload pass. */ + reload_completed = 0; +} + +/* Allocate a chunk of memory for per-function machine-dependent data. */ + +static struct machine_function * +loongarch_init_machine_status (void) +{ + return ggc_cleared_alloc<machine_function> (); +} + +static void +loongarch_option_override_internal (struct gcc_options *opts) +{ + int i, regno, mode; + + if (flag_pic) + g_switch_value = 0; + + /* Handle target-specific options: compute defaults/conflicts etc. */ + loongarch_config_target (&la_target, la_opt_switches, + la_opt_cpu_arch, la_opt_cpu_tune, la_opt_fpu, + la_opt_abi_base, la_opt_abi_ext, la_opt_cmodel, 0); + + if (TARGET_ABI_LP64) + flag_pcc_struct_return = 0; + + /* Decide which rtx_costs structure to use. */ + if (optimize_size) + loongarch_cost = &loongarch_rtx_cost_optimize_size; + else + loongarch_cost = &loongarch_cpu_rtx_cost_data[LARCH_ACTUAL_TUNE]; + + /* If the user hasn't specified a branch cost, use the processor's + default. */ + if (loongarch_branch_cost == 0) + loongarch_branch_cost = loongarch_cost->branch_cost; + + + switch (la_target.cmodel) + { + case CMODEL_TINY_STATIC: + case CMODEL_EXTREME: + if (opts->x_flag_plt) + error ("code model %qs and %qs not support %s mode", + "tiny-static", "extreme", "plt"); + break; + + case CMODEL_NORMAL: + case CMODEL_TINY: + case CMODEL_LARGE: + break; + + default: + gcc_unreachable (); + } + + loongarch_init_print_operand_punct (); + + /* Set up array to map GCC register number to debug register number. + Ignore the special purpose register numbers. */ + + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + { + if (GP_REG_P (i) || FP_REG_P (i)) + loongarch_dwarf_regno[i] = i; + else + loongarch_dwarf_regno[i] = INVALID_REGNUM; + } + + /* Set up loongarch_hard_regno_mode_ok. */ + for (mode = 0; mode < MAX_MACHINE_MODE; mode++) + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + loongarch_hard_regno_mode_ok_p[mode][regno] + = loongarch_hard_regno_mode_ok_uncached (regno, (machine_mode) mode); + + /* Function to allocate machine-dependent function status. */ + init_machine_status = &loongarch_init_machine_status; +} + + +/* Implement TARGET_OPTION_OVERRIDE. */ + +static void +loongarch_option_override (void) +{ + loongarch_option_override_internal (&global_options); +} + +/* Implement TARGET_CONDITIONAL_REGISTER_USAGE. 
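
   When the target has no hardware floating point, the hook below removes
   the FPR and FCC banks from the accessible register set.  As an
   illustration (not part of the patch), under -msoft-float a hard-register
   variable such as

        register double x asm ("$f0");

   can then no longer be honoured.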
*/ + +static void +loongarch_conditional_register_usage (void) +{ + if (!TARGET_HARD_FLOAT) + accessible_reg_set &= ~(reg_class_contents[FP_REGS] + | reg_class_contents[FCC_REGS]); +} + +/* Implement EH_USES. */ + +bool +loongarch_eh_uses (unsigned int regno ATTRIBUTE_UNUSED) +{ + return false; +} + +/* Implement EPILOGUE_USES. */ + +bool +loongarch_epilogue_uses (unsigned int regno) +{ + /* Say that the epilogue uses the return address register. Note that + in the case of sibcalls, the values "used by the epilogue" are + considered live at the start of the called function. */ + if (regno == RETURN_ADDR_REGNUM) + return true; + + return false; +} + +bool +loongarch_load_store_bonding_p (rtx *operands, machine_mode mode, bool load_p) +{ + rtx reg1, reg2, mem1, mem2, base1, base2; + enum reg_class rc1, rc2; + HOST_WIDE_INT offset1, offset2; + + if (load_p) + { + reg1 = operands[0]; + reg2 = operands[2]; + mem1 = operands[1]; + mem2 = operands[3]; + } + else + { + reg1 = operands[1]; + reg2 = operands[3]; + mem1 = operands[0]; + mem2 = operands[2]; + } + + if (loongarch_address_insns (XEXP (mem1, 0), mode, false) == 0 + || loongarch_address_insns (XEXP (mem2, 0), mode, false) == 0) + return false; + + loongarch_split_plus (XEXP (mem1, 0), &base1, &offset1); + loongarch_split_plus (XEXP (mem2, 0), &base2, &offset2); + + /* Base regs do not match. */ + if (!REG_P (base1) || !rtx_equal_p (base1, base2)) + return false; + + /* Either of the loads is clobbering base register. It is legitimate to bond + loads if second load clobbers base register. However, hardware does not + support such bonding. */ + if (load_p + && (REGNO (reg1) == REGNO (base1) || (REGNO (reg2) == REGNO (base1)))) + return false; + + /* Loading in same registers. */ + if (load_p && REGNO (reg1) == REGNO (reg2)) + return false; + + /* The loads/stores are not of same type. */ + rc1 = REGNO_REG_CLASS (REGNO (reg1)); + rc2 = REGNO_REG_CLASS (REGNO (reg2)); + if (rc1 != rc2 && !reg_class_subset_p (rc1, rc2) + && !reg_class_subset_p (rc2, rc1)) + return false; + + if (abs (offset1 - offset2) != GET_MODE_SIZE (mode)) + return false; + + return true; +} + +/* Implement TARGET_TRAMPOLINE_INIT. */ + +static void +loongarch_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) +{ + rtx addr, end_addr, mem; + rtx trampoline[8]; + unsigned int i, j; + HOST_WIDE_INT end_addr_offset, static_chain_offset, target_function_offset; + + /* Work out the offsets of the pointers from the start of the + trampoline code. */ + end_addr_offset = TRAMPOLINE_CODE_SIZE; + static_chain_offset = end_addr_offset; + target_function_offset = static_chain_offset + GET_MODE_SIZE (ptr_mode); + + /* Get pointers to the beginning and end of the code block. */ + addr = force_reg (Pmode, XEXP (m_tramp, 0)); + end_addr + = loongarch_force_binary (Pmode, PLUS, addr, GEN_INT (end_addr_offset)); + +#define OP(X) gen_int_mode (X, SImode) + + /* Build up the code in TRAMPOLINE. */ + i = 0; + /*pcaddi $static_chain,0 + ld.[dw] $tmp,$static_chain,target_function_offset + ld.[dw] $static_chain,$static_chain,static_chain_offset + jirl $r0,$tmp,0 */ + trampoline[i++] = OP (0x18000000 | (STATIC_CHAIN_REGNUM - GP_REG_FIRST)); + trampoline[i++] = OP ((ptr_mode == DImode ? 0x28c00000 : 0x28800000) + | 19 /* $t7 */ + | ((STATIC_CHAIN_REGNUM - GP_REG_FIRST) << 5) + | ((target_function_offset & 0xfff) << 10)); + trampoline[i++] = OP ((ptr_mode == DImode ? 
0x28c00000 : 0x28800000) + | (STATIC_CHAIN_REGNUM - GP_REG_FIRST) + | ((STATIC_CHAIN_REGNUM - GP_REG_FIRST) << 5) + | ((static_chain_offset & 0xfff) << 10)); + trampoline[i++] = OP (0x4c000000 | (19 << 5)); +#undef OP + + for (j = 0; j < i; j++) + { + mem = adjust_address (m_tramp, SImode, j * GET_MODE_SIZE (SImode)); + loongarch_emit_move (mem, trampoline[j]); + } + + /* Set up the static chain pointer field. */ + mem = adjust_address (m_tramp, ptr_mode, static_chain_offset); + loongarch_emit_move (mem, chain_value); + + /* Set up the target function field. */ + mem = adjust_address (m_tramp, ptr_mode, target_function_offset); + loongarch_emit_move (mem, XEXP (DECL_RTL (fndecl), 0)); + + /* Flush the code part of the trampoline. */ + emit_insn (gen_add3_insn (end_addr, addr, GEN_INT (TRAMPOLINE_SIZE))); + emit_insn (gen_clear_cache (addr, end_addr)); +} + +/* Implement HARD_REGNO_CALLER_SAVE_MODE. */ + +machine_mode +loongarch_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs, + machine_mode mode) +{ + /* For performance, avoid saving/restoring upper parts of a register + by returning MODE as save mode when the mode is known. */ + if (mode == VOIDmode) + return choose_hard_reg_mode (regno, nregs, NULL); + else + return mode; +} + +/* Implement TARGET_SPILL_CLASS. */ + +static reg_class_t +loongarch_spill_class (reg_class_t rclass ATTRIBUTE_UNUSED, + machine_mode mode ATTRIBUTE_UNUSED) +{ + return NO_REGS; +} + +/* Implement TARGET_PROMOTE_FUNCTION_MODE. */ + +/* This function is equivalent to default_promote_function_mode_always_promote + except that it returns a promoted mode even if type is NULL_TREE. This is + needed by libcalls which have no type (only a mode) such as fixed conversion + routines that take a signed or unsigned char/short argument and convert it + to a fixed type. */ + +static machine_mode +loongarch_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, + machine_mode mode, + int *punsignedp ATTRIBUTE_UNUSED, + const_tree fntype ATTRIBUTE_UNUSED, + int for_return ATTRIBUTE_UNUSED) +{ + int unsignedp; + + if (type != NULL_TREE) + return promote_mode (type, mode, punsignedp); + + unsignedp = *punsignedp; + PROMOTE_MODE (mode, unsignedp, type); + *punsignedp = unsignedp; + return mode; +} + +/* Implement TARGET_STARTING_FRAME_OFFSET. See loongarch_compute_frame_info + for details about the frame layout. */ + +static HOST_WIDE_INT +loongarch_starting_frame_offset (void) +{ + if (FRAME_GROWS_DOWNWARD) + return 0; + return crtl->outgoing_args_size; +} + +/* Initialize the GCC target structure. 
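*/

/* An illustrative aside, not part of the patch itself: the trampoline
   built by loongarch_trampoline_init above is what makes GNU C
   nested-function pointers work.  For example

        int
        outer (int n)
        {
          int inner (int i) { return i + n; }
          int (*fp) (int) = inner;
          return fp (1);
        }

   forces the four-instruction pcaddi/ld/ld/jirl stub onto the stack, with
   the static chain value and the address of inner stored right after the
   code.  */

/* The hook overrides follow; TARGET_INITIALIZER (at the end of the file)
   collects them into the targetm vector.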
*/ +#undef TARGET_ASM_ALIGNED_HI_OP +#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" +#undef TARGET_ASM_ALIGNED_SI_OP +#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t" +#undef TARGET_ASM_ALIGNED_DI_OP +#define TARGET_ASM_ALIGNED_DI_OP "\t.dword\t" + +#undef TARGET_OPTION_OVERRIDE +#define TARGET_OPTION_OVERRIDE loongarch_option_override + +#undef TARGET_LEGITIMIZE_ADDRESS +#define TARGET_LEGITIMIZE_ADDRESS loongarch_legitimize_address + +#undef TARGET_ASM_SELECT_RTX_SECTION +#define TARGET_ASM_SELECT_RTX_SECTION loongarch_select_rtx_section +#undef TARGET_ASM_FUNCTION_RODATA_SECTION +#define TARGET_ASM_FUNCTION_RODATA_SECTION loongarch_function_rodata_section + +#undef TARGET_SCHED_INIT +#define TARGET_SCHED_INIT loongarch_sched_init +#undef TARGET_SCHED_REORDER +#define TARGET_SCHED_REORDER loongarch_sched_reorder +#undef TARGET_SCHED_REORDER2 +#define TARGET_SCHED_REORDER2 loongarch_sched_reorder2 +#undef TARGET_SCHED_VARIABLE_ISSUE +#define TARGET_SCHED_VARIABLE_ISSUE loongarch_variable_issue +#undef TARGET_SCHED_ADJUST_COST +#define TARGET_SCHED_ADJUST_COST loongarch_adjust_cost +#undef TARGET_SCHED_ISSUE_RATE +#define TARGET_SCHED_ISSUE_RATE loongarch_issue_rate +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ + loongarch_multipass_dfa_lookahead + +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL loongarch_function_ok_for_sibcall + +#undef TARGET_VALID_POINTER_MODE +#define TARGET_VALID_POINTER_MODE loongarch_valid_pointer_mode +#undef TARGET_REGISTER_MOVE_COST +#define TARGET_REGISTER_MOVE_COST loongarch_register_move_cost +#undef TARGET_MEMORY_MOVE_COST +#define TARGET_MEMORY_MOVE_COST loongarch_memory_move_cost +#undef TARGET_RTX_COSTS +#define TARGET_RTX_COSTS loongarch_rtx_costs +#undef TARGET_ADDRESS_COST +#define TARGET_ADDRESS_COST loongarch_address_cost + +#undef TARGET_IN_SMALL_DATA_P +#define TARGET_IN_SMALL_DATA_P loongarch_in_small_data_p + +#undef TARGET_PREFERRED_RELOAD_CLASS +#define TARGET_PREFERRED_RELOAD_CLASS loongarch_preferred_reload_class + +#undef TARGET_ASM_FILE_START_FILE_DIRECTIVE +#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true + +#undef TARGET_EXPAND_BUILTIN_VA_START +#define TARGET_EXPAND_BUILTIN_VA_START loongarch_va_start + +#undef TARGET_PROMOTE_FUNCTION_MODE +#define TARGET_PROMOTE_FUNCTION_MODE loongarch_promote_function_mode +#undef TARGET_RETURN_IN_MEMORY +#define TARGET_RETURN_IN_MEMORY loongarch_return_in_memory + +#undef TARGET_FUNCTION_VALUE +#define TARGET_FUNCTION_VALUE loongarch_function_value +#undef TARGET_LIBCALL_VALUE +#define TARGET_LIBCALL_VALUE loongarch_libcall_value + +#undef TARGET_ASM_OUTPUT_MI_THUNK +#define TARGET_ASM_OUTPUT_MI_THUNK loongarch_output_mi_thunk +#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK +#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \ + hook_bool_const_tree_hwi_hwi_const_tree_true + +#undef TARGET_PRINT_OPERAND +#define TARGET_PRINT_OPERAND loongarch_print_operand +#undef TARGET_PRINT_OPERAND_ADDRESS +#define TARGET_PRINT_OPERAND_ADDRESS loongarch_print_operand_address +#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P +#define TARGET_PRINT_OPERAND_PUNCT_VALID_P \ + loongarch_print_operand_punct_valid_p + +#undef TARGET_SETUP_INCOMING_VARARGS +#define TARGET_SETUP_INCOMING_VARARGS loongarch_setup_incoming_varargs +#undef TARGET_STRICT_ARGUMENT_NAMING +#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true +#undef TARGET_MUST_PASS_IN_STACK +#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size +#undef 
TARGET_PASS_BY_REFERENCE +#define TARGET_PASS_BY_REFERENCE loongarch_pass_by_reference +#undef TARGET_ARG_PARTIAL_BYTES +#define TARGET_ARG_PARTIAL_BYTES loongarch_arg_partial_bytes +#undef TARGET_FUNCTION_ARG +#define TARGET_FUNCTION_ARG loongarch_function_arg +#undef TARGET_FUNCTION_ARG_ADVANCE +#define TARGET_FUNCTION_ARG_ADVANCE loongarch_function_arg_advance +#undef TARGET_FUNCTION_ARG_BOUNDARY +#define TARGET_FUNCTION_ARG_BOUNDARY loongarch_function_arg_boundary + +#undef TARGET_SCALAR_MODE_SUPPORTED_P +#define TARGET_SCALAR_MODE_SUPPORTED_P loongarch_scalar_mode_supported_p + +#undef TARGET_INIT_BUILTINS +#define TARGET_INIT_BUILTINS loongarch_init_builtins +#undef TARGET_BUILTIN_DECL +#define TARGET_BUILTIN_DECL loongarch_builtin_decl +#undef TARGET_EXPAND_BUILTIN +#define TARGET_EXPAND_BUILTIN loongarch_expand_builtin + +/* The generic ELF target does not always have TLS support. */ +#ifdef HAVE_AS_TLS +#undef TARGET_HAVE_TLS +#define TARGET_HAVE_TLS HAVE_AS_TLS +#endif + +#undef TARGET_CANNOT_FORCE_CONST_MEM +#define TARGET_CANNOT_FORCE_CONST_MEM loongarch_cannot_force_const_mem + +#undef TARGET_LEGITIMATE_CONSTANT_P +#define TARGET_LEGITIMATE_CONSTANT_P loongarch_legitimate_constant_p + +#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P +#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true + +#ifdef HAVE_AS_DTPRELWORD +#undef TARGET_ASM_OUTPUT_DWARF_DTPREL +#define TARGET_ASM_OUTPUT_DWARF_DTPREL loongarch_output_dwarf_dtprel +#endif + +#undef TARGET_LEGITIMATE_ADDRESS_P +#define TARGET_LEGITIMATE_ADDRESS_P loongarch_legitimate_address_p + +#undef TARGET_FRAME_POINTER_REQUIRED +#define TARGET_FRAME_POINTER_REQUIRED loongarch_frame_pointer_required + +#undef TARGET_CAN_ELIMINATE +#define TARGET_CAN_ELIMINATE loongarch_can_eliminate + +#undef TARGET_CONDITIONAL_REGISTER_USAGE +#define TARGET_CONDITIONAL_REGISTER_USAGE loongarch_conditional_register_usage + +#undef TARGET_TRAMPOLINE_INIT +#define TARGET_TRAMPOLINE_INIT loongarch_trampoline_init + +#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV +#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV loongarch_atomic_assign_expand_fenv + +#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS +#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true + +#undef TARGET_SPILL_CLASS +#define TARGET_SPILL_CLASS loongarch_spill_class + +#undef TARGET_HARD_REGNO_NREGS +#define TARGET_HARD_REGNO_NREGS loongarch_hard_regno_nregs +#undef TARGET_HARD_REGNO_MODE_OK +#define TARGET_HARD_REGNO_MODE_OK loongarch_hard_regno_mode_ok + +#undef TARGET_MODES_TIEABLE_P +#define TARGET_MODES_TIEABLE_P loongarch_modes_tieable_p + +#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS +#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 2 + +#undef TARGET_CAN_CHANGE_MODE_CLASS +#define TARGET_CAN_CHANGE_MODE_CLASS loongarch_can_change_mode_class + +#undef TARGET_CONSTANT_ALIGNMENT +#define TARGET_CONSTANT_ALIGNMENT loongarch_constant_alignment + +#undef TARGET_STARTING_FRAME_OFFSET +#define TARGET_STARTING_FRAME_OFFSET loongarch_starting_frame_offset + +#undef TARGET_SECONDARY_RELOAD +#define TARGET_SECONDARY_RELOAD loongarch_secondary_reload + +#undef TARGET_HAVE_SPECULATION_SAFE_VALUE +#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed + +struct gcc_target targetm = TARGET_INITIALIZER; + +#include "gt-loongarch.h" diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h new file mode 100644 index 0000000..4d107a4 --- /dev/null +++ b/gcc/config/loongarch/loongarch.h @@ -0,0 +1,1147 @@ +/* Definitions of target machine for GNU compiler. 
LoongArch version. + Copyright (C) 2021-2022 Free Software Foundation, Inc. + Contributed by Loongson Ltd. + Based on MIPS and RISC-V target for GNU compiler. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* LoongArch external variables defined in loongarch.cc. */ + +#include "config/loongarch/loongarch-opts.h" + +/* Macros to silence warnings about numbers being signed in traditional + C and unsigned in ISO C when compiled on 32-bit hosts. */ + +#define BITMASK_HIGH (((unsigned long) 1) << 31) /* 0x80000000 */ + +/* Run-time compilation parameters selecting different hardware subsets. */ + +/* Target CPU builtins. */ +#define TARGET_CPU_CPP_BUILTINS() loongarch_cpu_cpp_builtins (pfile) + +/* Default target_flags if no switches are specified. */ + +#ifdef IN_LIBGCC2 +#undef TARGET_64BIT +/* Make this compile time constant for libgcc2. */ +#ifdef __loongarch64 +#define TARGET_64BIT 1 +#else +#define TARGET_64BIT 0 +#endif +#endif /* IN_LIBGCC2 */ + +#define TARGET_LIBGCC_SDATA_SECTION ".sdata" + +/* Driver native functions for SPEC processing in the GCC driver. */ +#include "loongarch-driver.h" + +/* This definition replaces the formerly used 'm' constraint with a + different constraint letter in order to avoid changing semantics of + the 'm' constraint when accepting new address formats in + TARGET_LEGITIMATE_ADDRESS_P. The constraint letter defined here + must not be used in insn definitions or inline assemblies. */ +#define TARGET_MEM_CONSTRAINT 'w' + +/* Tell collect what flags to pass to nm. */ +#ifndef NM_FLAGS +#define NM_FLAGS "-Bn" +#endif + +/* SUBTARGET_ASM_SPEC is always passed to the assembler. It may be + overridden by subtargets. */ + +#ifndef SUBTARGET_ASM_SPEC +#define SUBTARGET_ASM_SPEC "" +#endif + +#undef ASM_SPEC +#define ASM_SPEC "%{mabi=*} %{subtarget_asm_spec}" + +/* Extra switches sometimes passed to the linker. */ + +#ifndef LINK_SPEC +#define LINK_SPEC "" +#endif /* LINK_SPEC defined */ + +/* Specs for the compiler proper. */ + +/* CC1_SPEC is the set of arguments to pass to the compiler proper. */ + +#undef CC1_SPEC +#define CC1_SPEC "\ +%{G*} \ +%(subtarget_cc1_spec)" + +/* Preprocessor specs. */ + +/* SUBTARGET_CPP_SPEC is passed to the preprocessor. It may be + overridden by subtargets. */ +#ifndef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC "" +#endif + +#define CPP_SPEC "%(subtarget_cpp_spec)" + +/* This macro defines names of additional specifications to put in the specs + that can be used in various specifications like CC1_SPEC. Its definition + is an initializer with a subgrouping for each command option. + + Each subgrouping contains a string constant, that defines the + specification name, and a string constant that used by the GCC driver + program. + + Do not define this macro if it does not need to do anything. 
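
   As an illustration, the "%(subtarget_cpp_spec)" reference in CPP_SPEC
   above is resolved through the table defined below; with the default
   empty SUBTARGET_CPP_SPEC it simply expands to nothing.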
*/ + +#define EXTRA_SPECS \ + {"subtarget_cc1_spec", SUBTARGET_CC1_SPEC}, \ + {"subtarget_cpp_spec", SUBTARGET_CPP_SPEC}, \ + {"subtarget_asm_spec", SUBTARGET_ASM_SPEC}, + +/* Registers may have a prefix which can be ignored when matching + user asm and register definitions. */ +#ifndef REGISTER_PREFIX +#define REGISTER_PREFIX "$" +#endif + +/* Local compiler-generated symbols must have a prefix that the assembler + understands. */ + +#define LOCAL_LABEL_PREFIX "." + +/* By default on the loongarch, external symbols do not have an underscore + prepended. */ + +#define USER_LABEL_PREFIX "" + +#ifndef PREFERRED_DEBUGGING_TYPE +#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG +#endif + +/* The size of DWARF addresses should be the same as the size of symbols + in the target file format. */ +#define DWARF2_ADDR_SIZE (TARGET_64BIT ? 8 : 4) + +/* By default, turn on GDB extensions. */ +#define DEFAULT_GDB_EXTENSIONS 1 + +/* By default, produce dwarf version 2 format debugging output in response + to the ‘-g’ option. */ +#define DWARF2_DEBUGGING_INFO 1 + +/* The mapping from gcc register number to DWARF 2 CFA column number. */ +#define DWARF_FRAME_REGNUM(REGNO) loongarch_dwarf_regno[REGNO] + +/* The DWARF 2 CFA column which tracks the return address. */ +#define DWARF_FRAME_RETURN_COLUMN RETURN_ADDR_REGNUM + +/* Before the prologue, RA lives in r1. */ +#define INCOMING_RETURN_ADDR_RTX gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM) + +/* Describe how we implement __builtin_eh_return. */ +#define EH_RETURN_DATA_REGNO(N) \ + ((N) < (4) ? (N) + GP_ARG_FIRST : INVALID_REGNUM) + +#define EH_RETURN_STACKADJ_RTX gen_rtx_REG (Pmode, GP_ARG_FIRST + 4) + +#define EH_USES(N) loongarch_eh_uses (N) + +/* Offsets recorded in opcodes are a multiple of this alignment factor. + The default for this in 64-bit mode is 8, which causes problems with + SFmode register saves. */ +#define DWARF_CIE_DATA_ALIGNMENT -4 + +/* Target machine storage layout. */ + +#define BITS_BIG_ENDIAN 0 +#define BYTES_BIG_ENDIAN 0 +#define WORDS_BIG_ENDIAN 0 + +#define MAX_BITS_PER_WORD 64 + +/* Width of a word, in units (bytes). */ +#define UNITS_PER_WORD (TARGET_64BIT ? 8 : 4) +#ifndef IN_LIBGCC2 +#define MIN_UNITS_PER_WORD 4 +#endif + +/* For LARCH, width of a floating point register. */ +#define UNITS_PER_FPREG (TARGET_DOUBLE_FLOAT ? 8 : 4) + +/* The largest size of value that can be held in floating-point + registers and moved with a single instruction. */ +#define UNITS_PER_HWFPVALUE \ + (TARGET_SOFT_FLOAT ? 0 : UNITS_PER_FPREG) + +/* The largest size of value that can be held in floating-point + registers. */ +#define UNITS_PER_FPVALUE \ + (TARGET_SOFT_FLOAT ? 0 \ + : TARGET_SINGLE_FLOAT ? UNITS_PER_FPREG \ + : LONG_DOUBLE_TYPE_SIZE / BITS_PER_UNIT) + +/* The number of bytes in a double. */ +#define UNITS_PER_DOUBLE (TYPE_PRECISION (double_type_node) / BITS_PER_UNIT) + +/* Set the sizes of the core types. */ +#define SHORT_TYPE_SIZE 16 +#define INT_TYPE_SIZE 32 +#define LONG_TYPE_SIZE (TARGET_64BIT ? 64 : 32) +#define LONG_LONG_TYPE_SIZE 64 + +#define FLOAT_TYPE_SIZE 32 +#define DOUBLE_TYPE_SIZE 64 +#define LONG_DOUBLE_TYPE_SIZE (TARGET_64BIT ? 128 : 64) + +/* Define the sizes of fixed-point types. */ +#define SHORT_FRACT_TYPE_SIZE 8 +#define FRACT_TYPE_SIZE 16 +#define LONG_FRACT_TYPE_SIZE 32 +#define LONG_LONG_FRACT_TYPE_SIZE 64 + +#define SHORT_ACCUM_TYPE_SIZE 16 +#define ACCUM_TYPE_SIZE 32 +#define LONG_ACCUM_TYPE_SIZE 64 +#define LONG_LONG_ACCUM_TYPE_SIZE (TARGET_64BIT ? 
128 : 64) + +/* long double is not a fixed mode, but the idea is that, if we + support long double, we also want a 128-bit integer type. */ +#define MAX_FIXED_MODE_SIZE LONG_DOUBLE_TYPE_SIZE + +/* Width in bits of a pointer. */ +#ifndef POINTER_SIZE +#define POINTER_SIZE (TARGET_64BIT ? 64 : 32) +#endif + +/* Allocation boundary (in *bits*) for storing arguments in argument list. */ +#define PARM_BOUNDARY BITS_PER_WORD + +/* Allocation boundary (in *bits*) for the code of a function. */ +#define FUNCTION_BOUNDARY 32 + +/* Alignment of field after `int : 0' in a structure. */ +#define EMPTY_FIELD_BOUNDARY 32 + +/* Number of bits which any structure or union's size must be a multiple of. + Each structure or union's size is rounded up to a multiple of this. */ +#define STRUCTURE_SIZE_BOUNDARY 8 + +/* There is no point aligning anything to a rounder boundary than + LONG_DOUBLE_TYPE_SIZE. */ +#define BIGGEST_ALIGNMENT (LONG_DOUBLE_TYPE_SIZE) + +/* All accesses must be aligned. */ +#define STRICT_ALIGNMENT (TARGET_STRICT_ALIGN) + +/* Define this if you wish to imitate the way many other C compilers + handle alignment of bitfields and the structures that contain + them. + + The behavior is that the type written for a bit-field (`int', + `short', or other integer type) imposes an alignment for the + entire structure, as if the structure really did contain an + ordinary field of that type. In addition, the bit-field is placed + within the structure so that it would fit within such a field, + not crossing a boundary for it. + + Thus, on most machines, a bit-field whose type is written as `int' + would not cross a four-byte boundary, and would force four-byte + alignment for the whole structure. (The alignment used may not + be four bytes; it is controlled by the other alignment + parameters.) + + If the macro is defined, its definition should be a C expression; + a nonzero value for the expression enables this behavior. */ + +#define PCC_BITFIELD_TYPE_MATTERS 1 + +/* If defined, a C expression to compute the alignment for a static + variable. TYPE is the data type, and ALIGN is the alignment that + the object would ordinarily have. The value of this macro is used + instead of that alignment to align the object. + + If this macro is not defined, then ALIGN is used. + + One use of this macro is to increase alignment of medium-size + data to make it all fit in fewer cache lines. Another is to + cause character arrays to be word-aligned so that `strcpy' calls + that copy constants to character arrays can be done inline. */ + +#undef DATA_ALIGNMENT +#define DATA_ALIGNMENT(TYPE, ALIGN) \ + ((((ALIGN) < BITS_PER_WORD) \ + && (TREE_CODE (TYPE) == ARRAY_TYPE \ + || TREE_CODE (TYPE) == UNION_TYPE \ + || TREE_CODE (TYPE) == RECORD_TYPE)) ? BITS_PER_WORD : (ALIGN)) + +/* We need this for the same reason as DATA_ALIGNMENT, namely to cause + character arrays to be word-aligned so that `strcpy' calls that copy + constants to character arrays can be done inline, and 'strcmp' can be + optimised to use word loads. */ +#define LOCAL_ALIGNMENT(TYPE, ALIGN) DATA_ALIGNMENT (TYPE, ALIGN) + +/* Define if operations between registers always perform the operation + on the full register even if a narrower mode is specified. */ +#define WORD_REGISTER_OPERATIONS 1 + +/* When in 64-bit mode, move insns will sign extend SImode and FCCmode + moves. All other references are zero extended. */ +#define LOAD_EXTEND_OP(MODE) \ + (TARGET_64BIT && ((MODE) == SImode || (MODE) == FCCmode) ? 
SIGN_EXTEND \ + : ZERO_EXTEND) + +/* Define this macro if it is advisable to hold scalars in registers + in a wider mode than that declared by the program. In such cases, + the value is constrained to be within the bounds of the declared + type, but kept valid in the wider mode. The signedness of the + extension may differ from that of the type. */ + +#define PROMOTE_MODE(MODE, UNSIGNEDP, TYPE) \ + if (GET_MODE_CLASS (MODE) == MODE_INT \ + && GET_MODE_SIZE (MODE) < UNITS_PER_WORD) \ + { \ + if ((MODE) == SImode) \ + (UNSIGNEDP) = 0; \ + (MODE) = Pmode; \ + } + +/* Pmode is always the same as ptr_mode, but not always the same as word_mode. + Extensions of pointers to word_mode must be signed. */ +#define POINTERS_EXTEND_UNSIGNED false + +/* Define if loading short immediate values into registers sign extends. */ +#define SHORT_IMMEDIATES_SIGN_EXTEND 1 + +/* The clz.{w/d} instructions have the natural values at 0. */ + +#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \ + ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2) + +/* Standard register usage. */ + +/* Number of hardware registers. We have: + + - 32 integer registers + - 32 floating point registers + - 8 condition code registers + - 2 fake registers: + - ARG_POINTER_REGNUM + - FRAME_POINTER_REGNUM +*/ + +#define FIRST_PSEUDO_REGISTER 74 + +/* zero, tp, sp and x are fixed. */ +#define FIXED_REGISTERS \ +{ /* General registers. */ \ + 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + /* Floating-point registers. */ \ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + /* Others. */ \ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1} + +/* The call RTLs themselves clobber ra. */ +#define CALL_USED_REGISTERS \ +{ /* General registers. */ \ + 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ + /* Floating-point registers. */ \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, \ + /* Others. */ \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + +/* Internal macros to classify a register number as to whether it's a + general purpose register, a floating point register, or a status + register. */ + +#define GP_REG_FIRST 0 +#define GP_REG_LAST 31 +#define GP_REG_NUM (GP_REG_LAST - GP_REG_FIRST + 1) + +#define FP_REG_FIRST 32 +#define FP_REG_LAST 63 +#define FP_REG_NUM (FP_REG_LAST - FP_REG_FIRST + 1) + +/* The DWARF 2 CFA column which tracks the return address from a + signal handler context. This means that to maintain backwards + compatibility, no hard register can be assigned this column if it + would need to be handled by the DWARF unwinder. */ +#define DWARF_ALT_FRAME_RETURN_COLUMN 72 + +#define FCC_REG_FIRST 64 +#define FCC_REG_LAST 71 +#define FCC_REG_NUM (FCC_REG_LAST - FCC_REG_FIRST + 1) + +#define GP_REG_P(REGNO) \ + ((unsigned int) ((int) (REGNO) - GP_REG_FIRST) < GP_REG_NUM) +#define FP_REG_P(REGNO) \ + ((unsigned int) ((int) (REGNO) - FP_REG_FIRST) < FP_REG_NUM) +#define FCC_REG_P(REGNO) \ + ((unsigned int) ((int) (REGNO) - FCC_REG_FIRST) < FCC_REG_NUM) + +#define FP_REG_RTX_P(X) (REG_P (X) && FP_REG_P (REGNO (X))) + +/* Select a register mode required for caller save of hard regno REGNO. */ +#define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE) \ + loongarch_hard_regno_caller_save_mode (REGNO, NREGS, MODE) + +/* Register to use for pushing function arguments. 
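   The definition below picks $r3, the register named "sp", which is also
   marked fixed in FIXED_REGISTERS above.

   As an aside (illustrative only), the classification macros above rely on
   unsigned wrap-around: FP_REG_P (40) computes (unsigned) (40 - 32) < 32
   and succeeds, while FP_REG_P (3) wraps to a large value and fails.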
*/ +#define STACK_POINTER_REGNUM (GP_REG_FIRST + 3) + +/* These two registers don't really exist: they get eliminated to either + the stack or hard frame pointer. */ +#define ARG_POINTER_REGNUM 72 +#define FRAME_POINTER_REGNUM 73 + +#define HARD_FRAME_POINTER_REGNUM (GP_REG_FIRST + 22) + +#define HARD_FRAME_POINTER_IS_FRAME_POINTER 0 +#define HARD_FRAME_POINTER_IS_ARG_POINTER 0 + +/* Register in which static-chain is passed to a function. */ +#define STATIC_CHAIN_REGNUM (GP_REG_FIRST + 20) /* $t8 */ + +#define GP_TEMP_FIRST (GP_REG_FIRST + 12) +#define LARCH_PROLOGUE_TEMP_REGNUM (GP_TEMP_FIRST + 1) +#define LARCH_PROLOGUE_TEMP2_REGNUM (GP_TEMP_FIRST) +#define LARCH_PROLOGUE_TEMP3_REGNUM (GP_TEMP_FIRST + 2) +#define LARCH_EPILOGUE_TEMP_REGNUM (GP_TEMP_FIRST) + +#define CALLEE_SAVED_REG_NUMBER(REGNO) \ + ((REGNO) >= 22 && (REGNO) <= 31 ? (REGNO) - 22 : -1) + +#define LARCH_PROLOGUE_TEMP(MODE) \ + gen_rtx_REG (MODE, LARCH_PROLOGUE_TEMP_REGNUM) +#define LARCH_PROLOGUE_TEMP2(MODE) \ + gen_rtx_REG (MODE, LARCH_PROLOGUE_TEMP2_REGNUM) +#define LARCH_PROLOGUE_TEMP3(MODE) \ + gen_rtx_REG (MODE, LARCH_PROLOGUE_TEMP3_REGNUM) +#define LARCH_EPILOGUE_TEMP(MODE) \ + gen_rtx_REG (MODE, LARCH_EPILOGUE_TEMP_REGNUM) + +/* Define this macro if it is as good or better to call a constant + function address than to call an address kept in a register. */ +#define NO_FUNCTION_CSE 1 + +#define THREAD_POINTER_REGNUM (GP_REG_FIRST + 2) + +/* Define the classes of registers for register constraints in the + machine description. Also define ranges of constants. + + One of the classes must always be named ALL_REGS and include all hard regs. + If there is more than one class, another class must be named NO_REGS + and contain no registers. + + The name GENERAL_REGS must be the name of a class (or an alias for + another name such as ALL_REGS). This is the class of registers + that is allowed by "r" in a register constraint. + Also, registers outside this class are allocated only when + instructions express preferences for them. + + The classes must be numbered in nondecreasing order; that is, + a larger-numbered class must never be contained completely + in a smaller-numbered class. + + For any two classes, it is very desirable that there be another + class that represents their union. */ + +enum reg_class +{ + NO_REGS, /* no registers in set */ + SIBCALL_REGS, /* registers used by indirect sibcalls */ + JIRL_REGS, /* registers used by indirect calls */ + CSR_REGS, /* integer registers except for $r0 and $r1 for lcsr. */ + GR_REGS, /* integer registers */ + FP_REGS, /* floating point registers */ + FCC_REGS, /* status registers (fp status) */ + FRAME_REGS, /* arg pointer and frame pointer */ + ALL_REGS, /* all registers */ + LIM_REG_CLASSES /* max value + 1 */ +}; + +#define N_REG_CLASSES (int) LIM_REG_CLASSES + +#define GENERAL_REGS GR_REGS + +/* An initializer containing the names of the register classes as C + string constants. These names are used in writing some of the + debugging dumps. */ + +#define REG_CLASS_NAMES \ +{ \ + "NO_REGS", \ + "SIBCALL_REGS", \ + "JIRL_REGS", \ + "CSR_REGS", \ + "GR_REGS", \ + "FP_REGS", \ + "FCC_REGS", \ + "FRAME_REGS", \ + "ALL_REGS" \ +} + +/* An initializer containing the contents of the register classes, + as integers which are bit masks. The Nth integer specifies the + contents of class N. The way the integer MASK is interpreted is + that register R is in the class if `MASK & (1 << R)' is 1. + + When the machine has more than 32 registers, an integer does not + suffice. 
Then the integers are replaced by sub-initializers, + braced groupings containing several integers. Each + sub-initializer must be suitable as an initializer for the type + `HARD_REG_SET' which is defined in `hard-reg-set.h'. */ + +#define REG_CLASS_CONTENTS \ +{ \ + { 0x00000000, 0x00000000, 0x00000000 }, /* NO_REGS */ \ + { 0x001ff000, 0x00000000, 0x00000000 }, /* SIBCALL_REGS */ \ + { 0xff9ffff0, 0x00000000, 0x00000000 }, /* JIRL_REGS */ \ + { 0xfffffffc, 0x00000000, 0x00000000 }, /* CSR_REGS */ \ + { 0xffffffff, 0x00000000, 0x00000000 }, /* GR_REGS */ \ + { 0x00000000, 0xffffffff, 0x00000000 }, /* FP_REGS */ \ + { 0x00000000, 0x00000000, 0x000000ff }, /* FCC_REGS */ \ + { 0x00000000, 0x00000000, 0x00000300 }, /* FRAME_REGS */ \ + { 0xffffffff, 0xffffffff, 0x000003ff } /* ALL_REGS */ \ +} + +/* A C expression whose value is a register class containing hard + register REGNO. In general there is more that one such class; + choose a class which is "minimal", meaning that no smaller class + also contains the register. */ + +#define REGNO_REG_CLASS(REGNO) loongarch_regno_to_class[(REGNO)] + +/* A macro whose definition is the name of the class to which a + valid base register must belong. A base register is one used in + an address which is the register value plus a displacement. */ + +#define BASE_REG_CLASS (GR_REGS) + +/* A macro whose definition is the name of the class to which a + valid index register must belong. An index register is one used + in an address where its value is either multiplied by a scale + factor or added to another register (as well as added to a + displacement). */ + +#define INDEX_REG_CLASS GR_REGS + +/* We generally want to put call-clobbered registers ahead of + call-saved ones. (IRA expects this.) */ + +#define REG_ALLOC_ORDER \ +{ /* Call-clobbered GPRs. */ \ + 12, 13, 14, 15, 16, 17, 18, 19, 20, 4, 5, 6, 7, 8, 9, 10, 11, 1, \ + /* Call-saved GPRs. */ \ + 23, 24, 25, 26, 27, 28, 29, 30, 31, \ + /* GPRs that can never be exposed to the register allocator. */ \ + 0, 2, 3, 21, 22, \ + /* Call-clobbered FPRs. */ \ + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, \ + 48, 49, 50, 51,52, 53, 54, 55, \ + 56, 57, 58, 59, 60, 61, 62, 63, \ + /* None of the remaining classes have defined call-saved \ + registers. */ \ + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73} + +#define IMM_BITS 12 +#define IMM_REACH (1LL << IMM_BITS) + +/* True if VALUE is an unsigned 6-bit number. */ + +#define UIMM6_OPERAND(VALUE) (((VALUE) & ~(unsigned HOST_WIDE_INT) 0x3f) == 0) + +/* True if VALUE is a signed 10-bit number. */ + +#define IMM10_OPERAND(VALUE) ((unsigned HOST_WIDE_INT) (VALUE) + 0x200 < 0x400) + +/* True if VALUE is a signed 12-bit number. */ + +#define IMM12_OPERAND(VALUE) \ + ((unsigned HOST_WIDE_INT) (VALUE) + IMM_REACH / 2 < IMM_REACH) + +/* True if VALUE is a signed 16-bit number. */ + +#define IMM16_OPERAND(VALUE) \ + ((unsigned HOST_WIDE_INT) (VALUE) + 0x8000 < 0x10000) + +/* True if VALUE is an unsigned 12-bit number. */ + +#define IMM12_OPERAND_UNSIGNED(VALUE) \ + (((VALUE) & ~(unsigned HOST_WIDE_INT) (IMM_REACH - 1)) == 0) + +/* True if VALUE can be loaded into a register using LU12I. */ + +#define LU12I_OPERAND(VALUE) \ + (((VALUE) | ((1UL << 31) - IMM_REACH)) == ((1UL << 31) - IMM_REACH) \ + || ((VALUE) | ((1UL << 31) - IMM_REACH)) + IMM_REACH == 0) + +/* True if VALUE can be loaded into a register using LU32I. 
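
   Illustrative notes, not part of the patch: the IMM*_OPERAND checks above
   use an unsigned wrap-around trick, rewriting the signed test
   -2048 <= V < 2048 as (unsigned) (V + 0x800) < 0x1000, so V = 2047 maps
   to 0xfff and is accepted while V = 2048 maps to 0x1000 and is rejected.
   The LU*I predicates instead mask: lu12i, lu32i and lu52i set bits 12-31,
   32-51 and 52-63 respectively, so LU12I_OPERAND above accepts any value
   whose low 12 bits are zero and whose remaining bits are a sign extension
   of bit 31.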
*/ + +#define LU32I_OPERAND(VALUE) \ + (((VALUE) | (((1ULL << 19) - 1) << 32)) == (((1ULL << 19) - 1) << 32) \ + || ((VALUE) | (((1ULL << 19) - 1) << 32)) + (1ULL << 32) == 0) + +/* True if VALUE can be loaded into a register using LU52I. */ + +#define LU52I_OPERAND(VALUE) (((VALUE) | (0xfffULL << 52)) == (0xfffULL << 52)) + +/* Return a value X with the low 12 bits clear, and such that + VALUE - X is a signed 12-bit value. */ + +#define CONST_HIGH_PART(VALUE) (((VALUE) + (IMM_REACH / 2)) & ~(IMM_REACH - 1)) + +#define CONST_LOW_PART(VALUE) ((VALUE) - CONST_HIGH_PART (VALUE)) + +#define IMM12_INT(X) IMM12_OPERAND (INTVAL (X)) +#define IMM12_INT_UNSIGNED(X) IMM12_OPERAND_UNSIGNED (INTVAL (X)) +#define LU12I_INT(X) LU12I_OPERAND (INTVAL (X)) +#define LU32I_INT(X) LU32I_OPERAND (INTVAL (X)) +#define LU52I_INT(X) LU52I_OPERAND (INTVAL (X)) +#define LARCH_U12BIT_OFFSET_P(OFFSET) (IN_RANGE (OFFSET, -2048, 2047)) +#define LARCH_9BIT_OFFSET_P(OFFSET) (IN_RANGE (OFFSET, -256, 255)) +#define LARCH_16BIT_OFFSET_P(OFFSET) (IN_RANGE (OFFSET, -32768, 32767)) +#define LARCH_SHIFT_2_OFFSET_P(OFFSET) (((OFFSET) & 0x3) == 0) + +/* Return the maximum number of consecutive registers + needed to represent mode MODE in a register of class CLASS. */ + +#define CLASS_MAX_NREGS(CLASS, MODE) loongarch_class_max_nregs (CLASS, MODE) + +/* Stack layout; function entry, exit and calling. */ + +#define STACK_GROWS_DOWNWARD 1 + +#define FRAME_GROWS_DOWNWARD 1 + +#define RETURN_ADDR_RTX loongarch_return_addr + +/* Similarly, don't use the least-significant bit to tell pointers to + code from vtable index. */ + +#define TARGET_PTRMEMFUNC_VBIT_LOCATION ptrmemfunc_vbit_in_delta + +#define ELIMINABLE_REGS \ + { \ + {ARG_POINTER_REGNUM, STACK_POINTER_REGNUM}, \ + {ARG_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM}, \ + {FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM}, \ + {FRAME_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM}, \ + } + +#define INITIAL_ELIMINATION_OFFSET(FROM, TO, OFFSET) \ + (OFFSET) = loongarch_initial_elimination_offset ((FROM), (TO)) + +/* Allocate stack space for arguments at the beginning of each function. */ +#define ACCUMULATE_OUTGOING_ARGS 1 + +/* The argument pointer always points to the first argument. */ +#define FIRST_PARM_OFFSET(FNDECL) 0 + +#define REG_PARM_STACK_SPACE(FNDECL) 0 + +/* Define this if it is the responsibility of the caller to + allocate the area reserved for arguments passed in registers. + If `ACCUMULATE_OUTGOING_ARGS' is also defined, the only effect + of this macro is to determine whether the space is included in + `crtl->outgoing_args_size'. */ +#define OUTGOING_REG_PARM_STACK_SPACE(FNTYPE) 1 + +#define STACK_BOUNDARY (TARGET_ABI_LP64 ? 128 : 64) + +/* Symbolic macros for the registers used to return integer and floating + point values. */ + +#define GP_RETURN (GP_REG_FIRST + 4) +#define FP_RETURN ((TARGET_SOFT_FLOAT) ? GP_RETURN : (FP_REG_FIRST + 0)) + +#define MAX_ARGS_IN_REGISTERS 8 + +/* Symbolic macros for the first/last argument registers. */ + +#define GP_ARG_FIRST (GP_REG_FIRST + 4) +#define GP_ARG_LAST (GP_ARG_FIRST + MAX_ARGS_IN_REGISTERS - 1) +#define FP_ARG_FIRST (FP_REG_FIRST + 0) +#define FP_ARG_LAST (FP_ARG_FIRST + MAX_ARGS_IN_REGISTERS - 1) + +/* 1 if N is a possible register number for function argument passing. + We have no FP argument registers when soft-float. */ + +/* Accept arguments in a0-a7, and in fa0-fa7 if permitted by the ABI. 
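
   As an illustrative sketch of the LP64D convention (not from the patch):
   for

        double f (int a, double b, int c, double d);

   a is passed in $a0, b in $fa0, c in $a1 and d in $fa1, and the result is
   returned in $fa0; under the soft-float ABI all four arguments travel in
   $a0-$a3 instead.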
*/ +#define FUNCTION_ARG_REGNO_P(N) \ + (IN_RANGE ((N), GP_ARG_FIRST, GP_ARG_LAST) \ + || (UNITS_PER_FP_ARG && IN_RANGE ((N), FP_ARG_FIRST, FP_ARG_LAST))) + +typedef struct { + /* Number of integer registers used so far, up to MAX_ARGS_IN_REGISTERS. */ + unsigned int num_gprs; + + /* Number of floating-point registers used so far, likewise. */ + unsigned int num_fprs; + +} CUMULATIVE_ARGS; + +/* Initialize a variable CUM of type CUMULATIVE_ARGS + for a call to a function whose data type is FNTYPE. + For a library call, FNTYPE is 0. */ + +#define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, INDIRECT, N_NAMED_ARGS) \ + memset (&(CUM), 0, sizeof (CUM)) + +#define EPILOGUE_USES(REGNO) loongarch_epilogue_uses (REGNO) + +/* Treat LOC as a byte offset from the stack pointer and round it up + to the next fully-aligned offset. */ +#define LARCH_STACK_ALIGN(LOC) \ + (TARGET_ABI_LP64 ? ROUND_UP ((LOC), 16) : ROUND_UP ((LOC), 8)) + +#define MCOUNT_NAME "_mcount" + +/* Emit rtl for profiling. Output assembler code to FILE + to call "_mcount" for profiling a function entry. */ +#define PROFILE_HOOK(LABEL) \ + { \ + rtx fun, ra; \ + ra = get_hard_reg_initial_val (Pmode, RETURN_ADDR_REGNUM); \ + fun = gen_rtx_SYMBOL_REF (Pmode, MCOUNT_NAME); \ + emit_library_call (fun, LCT_NORMAL, VOIDmode, ra, Pmode); \ + } + +/* All the work done in PROFILE_HOOK, but still required. */ +#define FUNCTION_PROFILER(STREAM, LABELNO) do { } while (0) + +#define NO_PROFILE_COUNTERS 1 + +/* EXIT_IGNORE_STACK should be nonzero if, when returning from a function, + the stack pointer does not matter. The value is tested only in + functions that have frame pointers. + No definition is equivalent to always zero. */ + +#define EXIT_IGNORE_STACK 1 + +/* Trampolines are a block of code followed by two pointers. */ + +#define TRAMPOLINE_CODE_SIZE 16 +#define TRAMPOLINE_SIZE \ + ((Pmode == SImode) ? TRAMPOLINE_CODE_SIZE \ + : (TRAMPOLINE_CODE_SIZE + POINTER_SIZE * 2)) +#define TRAMPOLINE_ALIGNMENT POINTER_SIZE + +/* loongarch_trampoline_init calls this library function to flush + program and data caches. */ + +#ifndef CACHE_FLUSH_FUNC +#define CACHE_FLUSH_FUNC "_flush_cache" +#endif + +/* Addressing modes, and classification of registers for them. */ + +#define REGNO_OK_FOR_INDEX_P(REGNO) \ + loongarch_regno_mode_ok_for_base_p (REGNO, VOIDmode, 1) + +#define REGNO_MODE_OK_FOR_BASE_P(REGNO, MODE) \ + loongarch_regno_mode_ok_for_base_p (REGNO, MODE, 1) + +/* Maximum number of registers that can appear in a valid memory address. */ + +#define MAX_REGS_PER_ADDRESS 2 + +/* Check for constness inline but use loongarch_legitimate_address_p + to check whether a constant really is an address. */ + +#define CONSTANT_ADDRESS_P(X) (CONSTANT_P (X) && memory_address_p (SImode, X)) + +/* This handles the magic '..CURRENT_FUNCTION' symbol, which means + 'the start of the function that this code is output in'. */ + +#define ASM_OUTPUT_LABELREF(FILE, NAME) \ + do \ + { \ + if (strcmp (NAME, "..CURRENT_FUNCTION") == 0) \ + asm_fprintf ((FILE), "%U%s", \ + XSTR (XEXP (DECL_RTL (current_function_decl), 0), 0)); \ + else \ + asm_fprintf ((FILE), "%U%s", (NAME)); \ + } \ + while (0) + +#define CASE_VECTOR_MODE Pmode + +#define CASE_VECTOR_SHORTEN_MODE(MIN, MAX, BODY) Pmode + +/* Define this as 1 if `char' should by default be signed; else as 0. 
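
   As an illustration, with the definition below plain char is signed, so

        char c = 0x80;
        int i = c;

   leaves i equal to -128 unless -funsigned-char is given.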
*/ +#ifndef DEFAULT_SIGNED_CHAR +#define DEFAULT_SIGNED_CHAR 1 +#endif + +/* The SPARC port says: + The maximum number of bytes that a single instruction + can move quickly between memory and registers or between + two memory locations. */ +#define MOVE_MAX UNITS_PER_WORD +#define MAX_MOVE_MAX 8 + +/* The SPARC port says: + Nonzero if access to memory by bytes is slow and undesirable. + For RISC chips, it means that access to memory by bytes is no + better than access by words when possible, so grab a whole word + and maybe make use of that. */ +#define SLOW_BYTE_ACCESS 1 + +/* Standard LoongArch integer shifts truncate the shift amount to the + width of the shifted operand. */ +#define SHIFT_COUNT_TRUNCATED 1 + +/* Specify the machine mode that pointers have. + After generation of rtl, the compiler makes no further distinction + between pointers and any other objects of this machine mode. */ + +#ifndef Pmode +#define Pmode (TARGET_64BIT ? DImode : SImode) +#endif + +/* Give call MEMs SImode since it is the "most permissive" mode + for both 32-bit and 64-bit targets. */ + +#define FUNCTION_MODE SImode + +/* We allocate $fcc registers by hand and can't cope with moves of + CCmode registers to and from pseudos (or memory). */ +#define AVOID_CCMODE_COPIES + +/* A C expression for the cost of a branch instruction. A value of + 1 is the default; other values are interpreted relative to that. */ + +#define BRANCH_COST(speed_p, predictable_p) loongarch_branch_cost +#define LOGICAL_OP_NON_SHORT_CIRCUIT 0 + +/* Return the asm template for a conditional branch instruction. + OPCODE is the opcode's mnemonic and OPERANDS is the asm template for + its operands. */ +#define LARCH_BRANCH(OPCODE, OPERANDS) OPCODE "\t" OPERANDS + +/* Control the assembler format that we output. */ + +/* Output to assembler file text saying following lines + may contain character constants, extra white space, comments, etc. */ + +#ifndef ASM_APP_ON +#define ASM_APP_ON " #APP\n" +#endif + +/* Output to assembler file text saying following lines + no longer contain unusual constructs. */ + +#ifndef ASM_APP_OFF +#define ASM_APP_OFF " #NO_APP\n" +#endif + +#define REGISTER_NAMES \ +{ "$r0", "$r1", "$r2", "$r3", "$r4", "$r5", "$r6", "$r7", \ + "$r8", "$r9", "$r10", "$r11", "$r12", "$r13", "$r14", "$r15", \ + "$r16", "$r17", "$r18", "$r19", "$r20", "$r21", "$r22", "$r23", \ + "$r24", "$r25", "$r26", "$r27", "$r28", "$r29", "$r30", "$r31", \ + "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", \ + "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15", \ + "$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22", "$f23", \ + "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31", \ + "$fcc0","$fcc1","$fcc2","$fcc3","$fcc4","$fcc5","$fcc6","$fcc7", \ + "$arg", "$frame"} + +/* This macro defines additional names for hard registers. 
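
   As an illustration (not part of the patch), the aliases let inline asm
   and register-related options refer to registers by their ABI names,
   e.g.

        register long thread_ptr asm ("tp");

   or a clobber list written as "a0", "t0" rather than "$r4", "$r12".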
*/ + +#define ADDITIONAL_REGISTER_NAMES \ +{ \ + { "zero", 0 + GP_REG_FIRST }, \ + { "ra", 1 + GP_REG_FIRST }, \ + { "tp", 2 + GP_REG_FIRST }, \ + { "sp", 3 + GP_REG_FIRST }, \ + { "a0", 4 + GP_REG_FIRST }, \ + { "a1", 5 + GP_REG_FIRST }, \ + { "a2", 6 + GP_REG_FIRST }, \ + { "a3", 7 + GP_REG_FIRST }, \ + { "a4", 8 + GP_REG_FIRST }, \ + { "a5", 9 + GP_REG_FIRST }, \ + { "a6", 10 + GP_REG_FIRST }, \ + { "a7", 11 + GP_REG_FIRST }, \ + { "t0", 12 + GP_REG_FIRST }, \ + { "t1", 13 + GP_REG_FIRST }, \ + { "t2", 14 + GP_REG_FIRST }, \ + { "t3", 15 + GP_REG_FIRST }, \ + { "t4", 16 + GP_REG_FIRST }, \ + { "t5", 17 + GP_REG_FIRST }, \ + { "t6", 18 + GP_REG_FIRST }, \ + { "t7", 19 + GP_REG_FIRST }, \ + { "t8", 20 + GP_REG_FIRST }, \ + { "x", 21 + GP_REG_FIRST }, \ + { "fp", 22 + GP_REG_FIRST }, \ + { "s0", 23 + GP_REG_FIRST }, \ + { "s1", 24 + GP_REG_FIRST }, \ + { "s2", 25 + GP_REG_FIRST }, \ + { "s3", 26 + GP_REG_FIRST }, \ + { "s4", 27 + GP_REG_FIRST }, \ + { "s5", 28 + GP_REG_FIRST }, \ + { "s6", 29 + GP_REG_FIRST }, \ + { "s7", 30 + GP_REG_FIRST }, \ + { "s8", 31 + GP_REG_FIRST }, \ + { "v0", 4 + GP_REG_FIRST }, \ + { "v1", 5 + GP_REG_FIRST } \ +} + +/* Globalizing directive for a label. */ +#define GLOBAL_ASM_OP "\t.globl\t" + +/* This says how to output an external. It would be possible not to + output anything and let undefined symbol become external. However + the assembler uses length information on externals to allocate in + data/sdata bss/sbss, thereby saving exec time. */ + +#undef ASM_OUTPUT_EXTERNAL +#define ASM_OUTPUT_EXTERNAL(STREAM, DECL, NAME) \ + loongarch_output_external (STREAM, DECL, NAME) + +/* This is how to store into the string LABEL + the symbol_ref name of an internal numbered label where + PREFIX is the class of label and NUM is the number within the class. + This is suitable for output with `assemble_name'. */ + +#undef ASM_GENERATE_INTERNAL_LABEL +#define ASM_GENERATE_INTERNAL_LABEL(LABEL, PREFIX, NUM) \ + sprintf ((LABEL), "*%s%s%ld", (LOCAL_LABEL_PREFIX), (PREFIX), (long) (NUM)) + +/* Print debug labels as "foo = ." rather than "foo:" because they should + represent a byte pointer rather than an ISA-encoded address. This is + particularly important for code like: + + $LFBxxx = . + .cfi_startproc + ... + .section .gcc_except_table,... + ... + .uleb128 foo-$LFBxxx + + The .uleb128 requies $LFBxxx to match the FDE start address, which is + likewise a byte pointer rather than an ISA-encoded address. + + At the time of writing, this hook is not used for the function end + label: + + $LFExxx: + .end foo + + */ + +#define ASM_OUTPUT_DEBUG_LABEL(FILE, PREFIX, NUM) \ + fprintf (FILE, "%s%s%d = .\n", LOCAL_LABEL_PREFIX, PREFIX, NUM) + +/* This is how to output an element of a case-vector that is absolute. */ + +#define ASM_OUTPUT_ADDR_VEC_ELT(STREAM, VALUE) \ + fprintf (STREAM, "\t%s\t%sL%d\n", ptr_mode == DImode ? ".dword" : ".word", \ + LOCAL_LABEL_PREFIX, VALUE) + +/* This is how to output an element of a case-vector. */ + +#define ASM_OUTPUT_ADDR_DIFF_ELT(STREAM, BODY, VALUE, REL) \ + do \ + { \ + fprintf (STREAM, "\t%s\t%sL%d-%sL%d\n", \ + ptr_mode == DImode ? ".dword" : ".word", LOCAL_LABEL_PREFIX, \ + VALUE, LOCAL_LABEL_PREFIX, REL); \ + } \ + while (0) + +#define JUMP_TABLES_IN_TEXT_SECTION 0 + +/* This is how to output an assembler line + that says to advance the location counter + to a multiple of 2**LOG bytes. */ + +#define ASM_OUTPUT_ALIGN(STREAM, LOG) fprintf (STREAM, "\t.align\t%d\n", (LOG)) + +/* "nop" instruction 54525952 (andi $r0,$r0,0) is + used for padding. 
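   54525952 is 0x03400000, the encoding of that instruction, so the .align
   directive emitted below pads with the 4-byte nop pattern rather than
   with zero bytes; the trailing operand bounds how many bytes may be
   skipped.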
*/ +#define ASM_OUTPUT_ALIGN_WITH_NOP(STREAM, LOG) \ + fprintf (STREAM, "\t.align\t%d,54525952,4\n", (LOG)) + +/* This is how to output an assembler line to advance the location + counter by SIZE bytes. */ + +#undef ASM_OUTPUT_SKIP +#define ASM_OUTPUT_SKIP(STREAM, SIZE) \ + fprintf (STREAM, "\t.space\t" HOST_WIDE_INT_PRINT_UNSIGNED "\n", (SIZE)) + +/* This is how to output a string. */ +#undef ASM_OUTPUT_ASCII +#define ASM_OUTPUT_ASCII loongarch_output_ascii + +/* Define the strings to put out for each section in the object file. */ +#define TEXT_SECTION_ASM_OP "\t.text" /* instructions */ +#define DATA_SECTION_ASM_OP "\t.data" /* large data */ + +#undef READONLY_DATA_SECTION_ASM_OP +#define READONLY_DATA_SECTION_ASM_OP "\t.section\t.rodata" /* read-only data */ + +#define ASM_OUTPUT_REG_PUSH(STREAM, REGNO) \ + do \ + { \ + fprintf (STREAM, "\t%s\t%s,%s,-8\n\t%s\t%s,%s,0\n", \ + TARGET_64BIT ? "addi.d" : "addi.w", \ + reg_names[STACK_POINTER_REGNUM], \ + reg_names[STACK_POINTER_REGNUM], \ + TARGET_64BIT ? "st.d" : "st.w", reg_names[REGNO], \ + reg_names[STACK_POINTER_REGNUM]); \ + } \ + while (0) + +#define ASM_OUTPUT_REG_POP(STREAM, REGNO) \ + do \ + { \ + fprintf (STREAM, "\t%s\t%s,%s,0\n\t%s\t%s,%s,8\n", \ + TARGET_64BIT ? "ld.d" : "ld.w", reg_names[REGNO], \ + reg_names[STACK_POINTER_REGNUM], \ + TARGET_64BIT ? "addi.d" : "addi.w", \ + reg_names[STACK_POINTER_REGNUM], \ + reg_names[STACK_POINTER_REGNUM]); \ + } \ + while (0) + +/* How to start an assembler comment. + The leading space is important (the loongarch native assembler requires it). + */ +#ifndef ASM_COMMENT_START +#define ASM_COMMENT_START " #" +#endif + +#undef SIZE_TYPE +#define SIZE_TYPE (POINTER_SIZE == 64 ? "long unsigned int" : "unsigned int") + +#undef PTRDIFF_TYPE +#define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int") + +/* The maximum number of bytes that can be copied by one iteration of + a cpymemsi loop; see loongarch_block_move_loop. */ +#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4) + +/* The maximum number of bytes that can be copied by a straight-line + implementation of cpymemsi; see loongarch_block_move_straight. We want + to make sure that any loop-based implementation will iterate at + least twice. */ +#define LARCH_MAX_MOVE_BYTES_STRAIGHT (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2) + +/* The base cost of a memcpy call, for MOVE_RATIO and friends. These + values were determined experimentally by benchmarking with CSiBE. +*/ +#define LARCH_CALL_RATIO 8 + +/* Any loop-based implementation of cpymemsi will have at least + LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory + moves, so allow individual copies of fewer elements. + + When cpymemsi is not available, use a value approximating + the length of a memcpy call sequence, so that move_by_pieces + will generate inline code if it is shorter than a function call. + Since move_by_pieces_ninsns counts memory-to-memory moves, but + we'll have to generate a load/store pair for each, halve the + value of LARCH_CALL_RATIO to take that into account. */ + +#define MOVE_RATIO(speed) \ + (HAVE_cpymemsi \ + ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \ + : CLEAR_RATIO (speed) / 2) + +/* For CLEAR_RATIO, when optimizing for size, give a better estimate + of the length of a memset call, but use the default otherwise. */ + +#define CLEAR_RATIO(speed) ((speed) ? 
15 : LARCH_CALL_RATIO) + +/* This is similar to CLEAR_RATIO, but for a non-zero constant, so when + optimizing for size adjust the ratio to account for the overhead of + loading the constant and replicating it across the word. */ + +#define SET_RATIO(speed) ((speed) ? 15 : LARCH_CALL_RATIO - 2) + +#ifndef USED_FOR_TARGET +extern const enum reg_class loongarch_regno_to_class[]; +extern int loongarch_dwarf_regno[]; + +/* Information about a function's frame layout. */ +struct GTY (()) loongarch_frame_info +{ + /* The size of the frame in bytes. */ + HOST_WIDE_INT total_size; + + /* Bit X is set if the function saves or restores GPR X. */ + unsigned int mask; + + /* Likewise FPR X. */ + unsigned int fmask; + + /* How much the GPR save/restore routines adjust sp (or 0 if unused). */ + unsigned save_libcall_adjustment; + + /* Offsets of fixed-point and floating-point save areas from frame + bottom. */ + HOST_WIDE_INT gp_sp_offset; + HOST_WIDE_INT fp_sp_offset; + + /* Offset of virtual frame pointer from stack pointer/frame bottom. */ + HOST_WIDE_INT frame_pointer_offset; + + /* Offset of hard frame pointer from stack pointer/frame bottom. */ + HOST_WIDE_INT hard_frame_pointer_offset; + + /* The offset of arg_pointer_rtx from the bottom of the frame. */ + HOST_WIDE_INT arg_pointer_offset; +}; + +struct GTY (()) machine_function +{ + /* The next floating-point condition-code register to allocate + for 8CC targets, relative to FCC_REG_FIRST. */ + unsigned int next_fcc; + + /* The number of extra stack bytes taken up by register varargs. + This area is allocated by the callee at the very top of the frame. */ + int varargs_size; + + /* The current frame information, calculated by loongarch_compute_frame_info. + */ + struct loongarch_frame_info frame; +}; +#endif + +#define ASM_PREFERRED_EH_DATA_FORMAT(CODE, GLOBAL) \ + (((GLOBAL) ? DW_EH_PE_indirect : 0) | DW_EH_PE_absptr) + +/* Do emit .note.GNU-stack by default. */ +#ifndef NEED_INDICATE_EXEC_STACK +#define NEED_INDICATE_EXEC_STACK 1 +#endif + +/* The `Q' extension is not yet supported. */ +/* TODO: according to march. */ +#define UNITS_PER_FP_REG (TARGET_DOUBLE_FLOAT ? 8 : 4) + +/* The largest type that can be passed in floating-point registers. */ +/* TODO: according to mabi. */ +#define UNITS_PER_FP_ARG \ + (TARGET_HARD_FLOAT ? (TARGET_DOUBLE_FLOAT ? 8 : 4) : 0) + +#define FUNCTION_VALUE_REGNO_P(N) ((N) == GP_RETURN || (N) == FP_RETURN) diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md new file mode 100644 index 0000000..d3c809e --- /dev/null +++ b/gcc/config/loongarch/loongarch.md @@ -0,0 +1,3415 @@ +;; Machine Description for LoongArch for GNU compiler. +;; Copyright (C) 2021-2022 Free Software Foundation, Inc. +;; Contributed by Loongson Ltd. +;; Based on MIPS target for GNU compiler. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. + +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. 
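
;; Note for readers (not part of the original patch): the "unspec" codes
;; below describe operations that have no direct RTL representation but no
;; hidden side effects, while the "unspecv" (unspec_volatile) codes mark
;; operations such as the DBAR/IBAR barriers and the privileged CSR
;; accessors, which must not be CSE'd, reordered or deleted.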
+ +(define_c_enum "unspec" [ + ;; Integer operations that are too cumbersome to describe directly. + UNSPEC_REVB_2H + UNSPEC_REVB_4H + UNSPEC_REVH_D + + ;; Floating-point moves. + UNSPEC_LOAD_LOW + UNSPEC_LOAD_HIGH + UNSPEC_STORE_WORD + UNSPEC_MOVGR2FRH + UNSPEC_MOVFRH2GR + + ;; Floating point unspecs. + UNSPEC_FRINT + UNSPEC_FCLASS + + ;; Override return address for exception handling. + UNSPEC_EH_RETURN + + ;; Bit operation + UNSPEC_BYTEPICK_W + UNSPEC_BYTEPICK_D + UNSPEC_BITREV_4B + UNSPEC_BITREV_8B + + ;; TLS + UNSPEC_TLS_GD + UNSPEC_TLS_LD + UNSPEC_TLS_LE + UNSPEC_TLS_IE + + ;; Stack tie + UNSPEC_TIE + + ;; CRC + UNSPEC_CRC + UNSPEC_CRCC +]) + +(define_c_enum "unspecv" [ + ;; Blockage and synchronisation. + UNSPECV_BLOCKAGE + UNSPECV_DBAR + UNSPECV_IBAR + + ;; Privileged instructions + UNSPECV_CSRRD + UNSPECV_CSRWR + UNSPECV_CSRXCHG + UNSPECV_IOCSRRD + UNSPECV_IOCSRWR + UNSPECV_CACOP + UNSPECV_LDDIR + UNSPECV_LDPTE + UNSPECV_ERTN + + ;; Stack checking. + UNSPECV_PROBE_STACK_RANGE + + ;; Floating-point environment. + UNSPECV_MOVFCSR2GR + UNSPECV_MOVGR2FCSR + + ;; Others + UNSPECV_CPUCFG + UNSPECV_ASRTLE_D + UNSPECV_ASRTGT_D + UNSPECV_SYSCALL + UNSPECV_BREAK +]) + +(define_constants + [(RETURN_ADDR_REGNUM 1) + (T0_REGNUM 12) + (T1_REGNUM 13) + (S0_REGNUM 23) + + ;; PIC long branch sequences are never longer than 100 bytes. + (MAX_PIC_BRANCH_LENGTH 100) +]) + +(include "predicates.md") +(include "constraints.md") + +;; .................... +;; +;; Attributes +;; +;; .................... + +(define_attr "got" "unset,load" + (const_string "unset")) + +;; For jirl instructions, this attribute is DIRECT when the target address +;; is symbolic and INDIRECT when it is a register. +(define_attr "jirl" "unset,direct,indirect" + (const_string "unset")) + + +;; Classification of moves, extensions and truncations. Most values +;; are as for "type" (see below) but there are also the following +;; move-specific values: +;; +;; sll0 "slli.w DEST,SRC,0", which on 64-bit targets is guaranteed +;; to produce a sign-extended DEST, even if SRC is not +;; properly sign-extended +;; pick_ins BSTRPICK.W, BSTRPICK.D, BSTRINS.W or BSTRINS.D instruction +;; andi a single ANDI instruction +;; shift_shift a shift left followed by a shift right +;; +;; This attribute is used to determine the instruction's length and +;; scheduling type. For doubleword moves, the attribute always describes +;; the split instructions; in some cases, it is more appropriate for the +;; scheduling type to be "multi" instead. +(define_attr "move_type" + "unknown,load,fpload,store,fpstore,mgtf,mftg,imul,move,fmove, + const,signext,pick_ins,logical,arith,sll0,andi,shift_shift" + (const_string "unknown")) + +(define_attr "alu_type" "unknown,add,sub,not,nor,and,or,xor" + (const_string "unknown")) + +;; Main data type used by the insn +(define_attr "mode" "unknown,none,QI,HI,SI,DI,TI,SF,DF,TF,FCC" + (const_string "unknown")) + +;; True if the main data type is twice the size of a word. +(define_attr "dword_mode" "no,yes" + (cond [(and (eq_attr "mode" "DI,DF") + (not (match_test "TARGET_64BIT"))) + (const_string "yes") + + (and (eq_attr "mode" "TI,TF") + (match_test "TARGET_64BIT")) + (const_string "yes")] + (const_string "no"))) + +;; True if the main data type is four times of the size of a word. +(define_attr "qword_mode" "no,yes" + (cond [(and (eq_attr "mode" "TI,TF") + (not (match_test "TARGET_64BIT"))) + (const_string "yes")] + (const_string "no"))) + +;; Classification of each insn. 
+;; branch conditional branch +;; jump unconditional jump +;; call unconditional call +;; load load instruction(s) +;; fpload floating point load +;; fpidxload floating point indexed load +;; store store instruction(s) +;; fpstore floating point store +;; fpidxstore floating point indexed store +;; prefetch memory prefetch (register + offset) +;; prefetchx memory indexed prefetch (register + register) +;; condmove conditional moves +;; mgtf move general-purpose register to floating point register +;; mftg move floating point register to general-purpose register +;; const load constant +;; arith integer arithmetic instructions +;; logical integer logical instructions +;; shift integer shift instructions +;; slt set less than instructions +;; signext sign extend instructions +;; clz the clz and clo instructions +;; trap trap if instructions +;; imul integer multiply +;; idiv integer divide +;; move integer move +;; fmove floating point register move +;; fadd floating point add/subtract +;; fmul floating point multiply +;; fmadd floating point multiply-add +;; fdiv floating point divide +;; frdiv floating point reciprocal divide +;; fabs floating point absolute value +;; fneg floating point negation +;; fcmp floating point compare +;; fcvt floating point convert +;; fsqrt floating point square root +;; frsqrt floating point reciprocal square root +;; multi multiword sequence (or user asm statements) +;; atomic atomic memory update instruction +;; syncloop memory atomic operation implemented as a sync loop +;; nop no operation +;; ghost an instruction that produces no real code +(define_attr "type" + "unknown,branch,jump,call,load,fpload,fpidxload,store,fpstore,fpidxstore, + prefetch,prefetchx,condmove,mgtf,mftg,const,arith,logical, + shift,slt,signext,clz,trap,imul,idiv,move, + fmove,fadd,fmul,fmadd,fdiv,frdiv,fabs,fneg,fcmp,fcvt,fsqrt, + frsqrt,accext,accmod,multi,atomic,syncloop,nop,ghost" + (cond [(eq_attr "jirl" "!unset") (const_string "call") + (eq_attr "got" "load") (const_string "load") + + (eq_attr "alu_type" "add,sub") (const_string "arith") + + (eq_attr "alu_type" "not,nor,and,or,xor") (const_string "logical") + + ;; If a doubleword move uses these expensive instructions, + ;; it is usually better to schedule them in the same way + ;; as the singleword form, rather than as "multi". + (eq_attr "move_type" "load") (const_string "load") + (eq_attr "move_type" "fpload") (const_string "fpload") + (eq_attr "move_type" "store") (const_string "store") + (eq_attr "move_type" "fpstore") (const_string "fpstore") + (eq_attr "move_type" "mgtf") (const_string "mgtf") + (eq_attr "move_type" "mftg") (const_string "mftg") + + ;; These types of move are always single insns. + (eq_attr "move_type" "imul") (const_string "imul") + (eq_attr "move_type" "fmove") (const_string "fmove") + (eq_attr "move_type" "signext") (const_string "signext") + (eq_attr "move_type" "pick_ins") (const_string "arith") + (eq_attr "move_type" "arith") (const_string "arith") + (eq_attr "move_type" "logical") (const_string "logical") + (eq_attr "move_type" "sll0") (const_string "shift") + (eq_attr "move_type" "andi") (const_string "logical") + + ;; These types of move are always split. + (eq_attr "move_type" "shift_shift") + (const_string "multi") + + ;; These types of move are split for quadword modes only. + (and (eq_attr "move_type" "move,const") + (eq_attr "qword_mode" "yes")) + (const_string "multi") + + ;; These types of move are split for doubleword modes only. 
+	 (and (eq_attr "move_type" "move,const")
+	      (eq_attr "dword_mode" "yes"))
+	 (const_string "multi")
+	 (eq_attr "move_type" "move") (const_string "move")
+	 (eq_attr "move_type" "const") (const_string "const")]
+	(const_string "unknown")))
+
+;; Mode for conversion types (fcvt)
+;; I2S	integer to float single (SI/DI to SF)
+;; I2D	integer to float double (SI/DI to DF)
+;; S2I	float to integer (SF to SI/DI)
+;; D2I	float to integer (DF to SI/DI)
+;; D2S	double to float single
+;; S2D	float single to double
+
+(define_attr "cnv_mode" "unknown,I2S,I2D,S2I,D2I,D2S,S2D"
+  (const_string "unknown"))
+
+;; The number of individual instructions that a non-branch pattern generates
+(define_attr "insn_count" ""
+  (cond [;; "Ghost" instructions occupy no space.
+	 (eq_attr "type" "ghost")
+	 (const_int 0)
+
+	 ;; Check for doubleword moves that are decomposed into two
+	 ;; instructions.
+	 (and (eq_attr "move_type" "mgtf,mftg,move")
+	      (eq_attr "dword_mode" "yes"))
+	 (const_int 2)
+
+	 ;; Check for quadword moves that are decomposed into four
+	 ;; instructions.
+	 (and (eq_attr "move_type" "mgtf,mftg,move")
+	      (eq_attr "qword_mode" "yes"))
+	 (const_int 4)
+
+	 ;; Constants, loads and stores are handled by external routines.
+	 (and (eq_attr "move_type" "const")
+	      (eq_attr "dword_mode" "yes"))
+	 (symbol_ref "loongarch_split_const_insns (operands[1])")
+	 (eq_attr "move_type" "const")
+	 (symbol_ref "loongarch_const_insns (operands[1])")
+	 (eq_attr "move_type" "load,fpload")
+	 (symbol_ref "loongarch_load_store_insns (operands[1], insn)")
+	 (eq_attr "move_type" "store,fpstore")
+	 (symbol_ref "loongarch_load_store_insns (operands[0], insn)")
+
+	 (eq_attr "type" "idiv")
+	 (symbol_ref "loongarch_idiv_insns (GET_MODE (PATTERN (insn)))")]
+(const_int 1)))
+
+;; Length of instruction in bytes.
+(define_attr "length" ""
+   (cond [
+	  ;; Branches further than +/- 128 KiB require two instructions.
+	  (eq_attr "type" "branch")
+	  (if_then_else (and (le (minus (match_dup 0) (pc)) (const_int 131064))
+			     (le (minus (pc) (match_dup 0)) (const_int 131068)))
+	  (const_int 4)
+	  (const_int 8))]
+	  (symbol_ref "get_attr_insn_count (insn) * 4")))
+
+;; Describe a user's asm statement.
+(define_asm_attributes
+  [(set_attr "type" "multi")])
+
+;; This mode iterator allows 32-bit and 64-bit GPR patterns to be generated
+;; from the same template.
+(define_mode_iterator GPR [SI (DI "TARGET_64BIT")])
+
+;; A copy of GPR that can be used when a pattern has two independent
+;; modes.
+(define_mode_iterator GPR2 [SI (DI "TARGET_64BIT")])
+
+;; This mode iterator allows 16-bit and 32-bit GPR patterns and 32-bit and
+;; 64-bit FPR patterns to be generated from the same template.
+(define_mode_iterator JOIN_MODE [HI
+				 SI
+				 (SF "TARGET_HARD_FLOAT")
+				 (DF "TARGET_DOUBLE_FLOAT")])
+
+;; This mode iterator allows :P to be used for patterns that operate on
+;; pointer-sized quantities.  Exactly one of the two alternatives will match.
+(define_mode_iterator P [(SI "Pmode == SImode") (DI "Pmode == DImode")])
+
+;; Likewise, but for XLEN-sized quantities.
+(define_mode_iterator X [(SI "!TARGET_64BIT") (DI "TARGET_64BIT")])
+
+;; 64-bit modes for which we provide move patterns.
+(define_mode_iterator MOVE64 [DI DF])
+
+;; 128-bit modes for which we provide move patterns on 64-bit targets.
+(define_mode_iterator MOVE128 [TI TF])
+
+;; Iterator for sub-32-bit integer modes.
+(define_mode_iterator SHORT [QI HI])
+
+;; Likewise the 64-bit truncate-and-shift patterns.
+(define_mode_iterator SUBDI [QI HI SI])
+
+;; Iterator for scalar fixed point modes.
+(define_mode_iterator QHWD [QI HI SI (DI "TARGET_64BIT")]) + +;; Iterator for hardware-supported floating-point modes. +(define_mode_iterator ANYF [(SF "TARGET_HARD_FLOAT") + (DF "TARGET_DOUBLE_FLOAT")]) + +;; A mode for which moves involving FPRs may need to be split. +(define_mode_iterator SPLITF + [(DF "!TARGET_64BIT && TARGET_DOUBLE_FLOAT") + (DI "!TARGET_64BIT && TARGET_DOUBLE_FLOAT") + (TF "TARGET_64BIT && TARGET_DOUBLE_FLOAT")]) + +;; In GPR templates, a string like "mul.<d>" will expand to "mul.w" in the +;; 32-bit version and "mul.d" in the 64-bit version. +(define_mode_attr d [(SI "w") (DI "d")]) + +;; This attribute gives the length suffix for a load or store instruction. +;; The same suffixes work for zero and sign extensions. +(define_mode_attr size [(QI "b") (HI "h") (SI "w") (DI "d")]) +(define_mode_attr SIZE [(QI "B") (HI "H") (SI "W") (DI "D")]) + +;; This attribute gives the mode mask of a SHORT. +(define_mode_attr mask [(QI "0x00ff") (HI "0xffff")]) + +;; This attribute gives the size (bits) of a SHORT. +(define_mode_attr 7_or_15 [(QI "7") (HI "15")]) + +;; Instruction names for stores. +(define_mode_attr store [(QI "sb") (HI "sh") (SI "sw") (DI "sd")]) + +;; This attribute gives the format suffix for floating-point operations. +(define_mode_attr fmt [(SF "s") (DF "d")]) +(define_mode_attr ifmt [(SI "w") (DI "l")]) + +;; This attribute gives the upper-case mode name for one unit of a +;; floating-point mode or vector mode. +(define_mode_attr UNITMODE [(SF "SF") (DF "DF")]) + +;; This attribute gives the integer mode that has half the size of +;; the controlling mode. +(define_mode_attr HALFMODE [(DF "SI") (DI "SI") (TF "DI")]) + +;; This code iterator allows signed and unsigned widening multiplications +;; to use the same template. +(define_code_iterator any_extend [sign_extend zero_extend]) + +;; This code iterator allows the two right shift instructions to be +;; generated from the same template. +(define_code_iterator any_shiftrt [ashiftrt lshiftrt]) + +;; This code iterator allows the three shift instructions to be generated +;; from the same template. +(define_code_iterator any_shift [ashift ashiftrt lshiftrt]) + +;; This code iterator allows the three bitwise instructions to be generated +;; from the same template. +(define_code_iterator any_bitwise [and ior xor]) +(define_code_iterator neg_bitwise [and ior]) + +;; This code iterator allows unsigned and signed division to be generated +;; from the same template. +(define_code_iterator any_div [div udiv mod umod]) + +;; This code iterator allows all native floating-point comparisons to be +;; generated from the same template. +(define_code_iterator fcond [unordered uneq unlt unle eq lt le + ordered ltgt ne ge gt unge ungt]) + +;; Equality operators. +(define_code_iterator equality_op [eq ne]) + +;; These code iterators allow the signed and unsigned scc operations to use +;; the same template. +(define_code_iterator any_gt [gt gtu]) +(define_code_iterator any_ge [ge geu]) +(define_code_iterator any_lt [lt ltu]) +(define_code_iterator any_le [le leu]) + +(define_code_iterator any_return [return simple_return]) + +;; <u> expands to an empty string when doing a signed operation and +;; "u" when doing an unsigned operation. +(define_code_attr u [(sign_extend "") (zero_extend "u") + (div "") (udiv "u") + (mod "") (umod "u") + (gt "") (gtu "u") + (ge "") (geu "u") + (lt "") (ltu "u") + (le "") (leu "u")]) + +;; <U> is like <u> except uppercase. 
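As a concrete illustration of how these iterators and attributes combine (not an excerpt from the port, just the expected expansion): the add<mode>3 template further down in this file is instantiated once per entry of the GPR iterator, giving an addsi3 pattern that emits add.w/addi.w and, when TARGET_64BIT holds, an adddi3 pattern that emits add.d/addi.d, with <MODE>, <d> and the other mode attributes substituted per instantiation. At the C level that corresponds to:

  long add64 (long a, long b) { return a + b; }  /* add.d  dst,a,b */
  int  add32 (int a, int b)   { return a + b; }  /* add.w  dst,a,b */

The code iterators and code attributes do the same job for RTL codes rather than modes; <u> above and <U> just below, for example, select the "u" suffix for the unsigned variant of a template.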
+(define_code_attr U [(sign_extend "") (zero_extend "U")]) + +;; <su> is like <u>, but the signed form expands to "s" rather than "". +(define_code_attr su [(sign_extend "s") (zero_extend "u")]) + +;; <optab> expands to the name of the optab for a particular code. +(define_code_attr optab [(ashift "ashl") + (ashiftrt "ashr") + (lshiftrt "lshr") + (ior "ior") + (xor "xor") + (and "and") + (plus "add") + (minus "sub") + (mult "mul") + (div "div") + (udiv "udiv") + (mod "mod") + (umod "umod") + (return "return") + (simple_return "simple_return")]) + +;; <insn> expands to the name of the insn that implements a particular code. +(define_code_attr insn [(ashift "sll") + (ashiftrt "sra") + (lshiftrt "srl") + (ior "or") + (xor "xor") + (and "and") + (plus "addu") + (minus "subu") + (div "div") + (udiv "div") + (mod "mod") + (umod "mod")]) + +;; <fcond> is the fcmp.cond.fmt condition associated with a particular code. +(define_code_attr fcond [(unordered "cun") + (uneq "cueq") + (unlt "cult") + (unle "cule") + (eq "ceq") + (lt "slt") + (le "sle") + (ordered "cor") + (ltgt "sne") + (ne "cune") + (ge "sge") + (gt "sgt") + (unge "cuge") + (ungt "cugt")]) + +;; The sel mnemonic to use depending on the condition test. +(define_code_attr sel [(eq "masknez") (ne "maskeqz")]) +(define_code_attr selinv [(eq "maskeqz") (ne "masknez")]) + +;; +;; .................... +;; +;; CONDITIONAL TRAPS +;; +;; .................... +;; + +(define_insn "trap" + [(trap_if (const_int 1) (const_int 0))] + "" +{ + return "break\t0"; +} + [(set_attr "type" "trap")]) + + + +;; +;; .................... +;; +;; ADDITION +;; +;; .................... +;; + +(define_insn "add<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (plus:ANYF (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f")))] + "" + "fadd.<fmt>\t%0,%1,%2" + [(set_attr "type" "fadd") + (set_attr "mode" "<UNITMODE>")]) + +(define_insn "add<mode>3" + [(set (match_operand:GPR 0 "register_operand" "=r,r") + (plus:GPR (match_operand:GPR 1 "register_operand" "r,r") + (match_operand:GPR 2 "arith_operand" "r,I")))] + "" + "add%i2.<d>\t%0,%1,%2"; + [(set_attr "alu_type" "add") + (set_attr "mode" "<MODE>")]) + +(define_insn "*addsi3_extended" + [(set (match_operand:DI 0 "register_operand" "=r,r") + (sign_extend:DI + (plus:SI (match_operand:SI 1 "register_operand" "r,r") + (match_operand:SI 2 "arith_operand" "r,I"))))] + "TARGET_64BIT" + "add%i2.w\t%0,%1,%2" + [(set_attr "alu_type" "add") + (set_attr "mode" "SI")]) + + +;; +;; .................... +;; +;; SUBTRACTION +;; +;; .................... +;; + +(define_insn "sub<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (minus:ANYF (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f")))] + "" + "fsub.<fmt>\t%0,%1,%2" + [(set_attr "type" "fadd") + (set_attr "mode" "<UNITMODE>")]) + +(define_insn "sub<mode>3" + [(set (match_operand:GPR 0 "register_operand" "=r") + (minus:GPR (match_operand:GPR 1 "register_operand" "rJ") + (match_operand:GPR 2 "register_operand" "r")))] + "" + "sub.<d>\t%0,%z1,%2" + [(set_attr "alu_type" "sub") + (set_attr "mode" "<MODE>")]) + + +(define_insn "*subsi3_extended" + [(set (match_operand:DI 0 "register_operand" "= r") + (sign_extend:DI + (minus:SI (match_operand:SI 1 "reg_or_0_operand" " rJ") + (match_operand:SI 2 "register_operand" " r"))))] + "TARGET_64BIT" + "sub.w\t%0,%z1,%2" + [(set_attr "type" "arith") + (set_attr "mode" "SI")]) + +;; +;; .................... 
+;; +;; MULTIPLICATION +;; +;; .................... +;; + +(define_insn "mul<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (mult:ANYF (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f")))] + "" + "fmul.<fmt>\t%0,%1,%2" + [(set_attr "type" "fmul") + (set_attr "mode" "<MODE>")]) + +(define_insn "mul<mode>3" + [(set (match_operand:GPR 0 "register_operand" "=r") + (mult:GPR (match_operand:GPR 1 "register_operand" "r") + (match_operand:GPR 2 "register_operand" "r")))] + "" + "mul.<d>\t%0,%1,%2" + [(set_attr "type" "imul") + (set_attr "mode" "<MODE>")]) + +(define_insn "mulsidi3_64bit" + [(set (match_operand:DI 0 "register_operand" "=r") + (mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand" "r")) + (sign_extend:DI (match_operand:SI 2 "register_operand" "r"))))] + "TARGET_64BIT" + "mul.d\t%0,%1,%2" + [(set_attr "type" "imul") + (set_attr "mode" "DI")]) + +(define_insn "*mulsi3_extended" + [(set (match_operand:DI 0 "register_operand" "=r") + (sign_extend:DI + (mult:SI (match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "register_operand" "r"))))] + "TARGET_64BIT" + "mul.w\t%0,%1,%2" + [(set_attr "type" "imul") + (set_attr "mode" "SI")]) + +;; +;; ........................ +;; +;; MULTIPLICATION HIGH-PART +;; +;; ........................ +;; + +(define_expand "<u>mulditi3" + [(set (match_operand:TI 0 "register_operand") + (mult:TI (any_extend:TI (match_operand:DI 1 "register_operand")) + (any_extend:TI (match_operand:DI 2 "register_operand"))))] + "TARGET_64BIT" +{ + rtx low = gen_reg_rtx (DImode); + emit_insn (gen_muldi3 (low, operands[1], operands[2])); + + rtx high = gen_reg_rtx (DImode); + emit_insn (gen_<u>muldi3_highpart (high, operands[1], operands[2])); + + emit_move_insn (gen_lowpart (DImode, operands[0]), low); + emit_move_insn (gen_highpart (DImode, operands[0]), high); + DONE; +}) + +(define_insn "<u>muldi3_highpart" + [(set (match_operand:DI 0 "register_operand" "=r") + (truncate:DI + (lshiftrt:TI + (mult:TI (any_extend:TI + (match_operand:DI 1 "register_operand" " r")) + (any_extend:TI + (match_operand:DI 2 "register_operand" " r"))) + (const_int 64))))] + "TARGET_64BIT" + "mulh.d<u>\t%0,%1,%2" + [(set_attr "type" "imul") + (set_attr "mode" "DI")]) + +(define_expand "<u>mulsidi3" + [(set (match_operand:DI 0 "register_operand" "=r") + (mult:DI (any_extend:DI + (match_operand:SI 1 "register_operand" " r")) + (any_extend:DI + (match_operand:SI 2 "register_operand" " r"))))] + "!TARGET_64BIT" +{ + rtx temp = gen_reg_rtx (SImode); + emit_insn (gen_mulsi3 (temp, operands[1], operands[2])); + emit_insn (gen_<u>mulsi3_highpart (loongarch_subword (operands[0], true), + operands[1], operands[2])); + emit_insn (gen_movsi (loongarch_subword (operands[0], false), temp)); + DONE; +}) + +(define_insn "<u>mulsi3_highpart" + [(set (match_operand:SI 0 "register_operand" "=r") + (truncate:SI + (lshiftrt:DI + (mult:DI (any_extend:DI + (match_operand:SI 1 "register_operand" " r")) + (any_extend:DI + (match_operand:SI 2 "register_operand" " r"))) + (const_int 32))))] + "!TARGET_64BIT" + "mulh.w<u>\t%0,%1,%2" + [(set_attr "type" "imul") + (set_attr "mode" "SI")]) + +;; +;; .................... +;; +;; DIVISION and REMAINDER +;; +;; .................... +;; + +;; Float division and modulus. 
+(define_expand "div<mode>3" + [(set (match_operand:ANYF 0 "register_operand") + (div:ANYF (match_operand:ANYF 1 "reg_or_1_operand") + (match_operand:ANYF 2 "register_operand")))] + "") + +(define_insn "*div<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (div:ANYF (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f")))] + "" + "fdiv.<fmt>\t%0,%1,%2" + [(set_attr "type" "fdiv") + (set_attr "mode" "<UNITMODE>")]) + +;; In 3A5000, the reciprocal operation is the same as the division operation. + +(define_insn "*recip<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (div:ANYF (match_operand:ANYF 1 "const_1_operand" "") + (match_operand:ANYF 2 "register_operand" "f")))] + "" + "frecip.<fmt>\t%0,%2" + [(set_attr "type" "frdiv") + (set_attr "mode" "<UNITMODE>")]) + +;; Integer division and modulus. +(define_expand "<optab><mode>3" + [(set (match_operand:GPR 0 "register_operand") + (any_div:GPR (match_operand:GPR 1 "register_operand") + (match_operand:GPR 2 "register_operand")))] + "" +{ + if (GET_MODE (operands[0]) == SImode) + { + rtx reg1 = gen_reg_rtx (DImode); + rtx reg2 = gen_reg_rtx (DImode); + + operands[1] = gen_rtx_SIGN_EXTEND (word_mode, operands[1]); + operands[2] = gen_rtx_SIGN_EXTEND (word_mode, operands[2]); + + emit_insn (gen_rtx_SET (reg1, operands[1])); + emit_insn (gen_rtx_SET (reg2, operands[2])); + + emit_insn (gen_<optab>di3_fake (operands[0], reg1, reg2)); + DONE; + } +}) + +(define_insn "*<optab><mode>3" + [(set (match_operand:GPR 0 "register_operand" "=&r") + (any_div:GPR (match_operand:GPR 1 "register_operand" "r") + (match_operand:GPR 2 "register_operand" "r")))] + "" +{ + return loongarch_output_division ("<insn>.<d><u>\t%0,%1,%2", operands); +} + [(set_attr "type" "idiv") + (set_attr "mode" "<MODE>")]) + +(define_insn "<optab>di3_fake" + [(set (match_operand:SI 0 "register_operand" "=&r") + (any_div:SI (match_operand:DI 1 "register_operand" "r") + (match_operand:DI 2 "register_operand" "r")))] + "" +{ + return loongarch_output_division ("<insn>.w<u>\t%0,%1,%2", operands); +} + [(set_attr "type" "idiv") + (set_attr "mode" "SI")]) + +;; Floating point multiply accumulate instructions. + +;; a * b + c +(define_insn "fma<mode>4" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (fma:ANYF (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f") + (match_operand:ANYF 3 "register_operand" "f")))] + "" + "fmadd.<fmt>\t%0,%1,%2,%3" + [(set_attr "type" "fmadd") + (set_attr "mode" "<UNITMODE>")]) + +;; a * b - c +(define_insn "fms<mode>4" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (fma:ANYF (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f") + (neg:ANYF (match_operand:ANYF 3 "register_operand" "f"))))] + "" + "fmsub.<fmt>\t%0,%1,%2,%3" + [(set_attr "type" "fmadd") + (set_attr "mode" "<UNITMODE>")]) + +;; fnma is defined in GCC as (fma (neg op1) op2 op3) +;; (-op1 * op2) + op3 ==> -(op1 * op2) + op3 ==> -((op1 * op2) - op3) +;; The loongarch nmsub instructions implement -((op1 * op2) - op3) +;; This transformation means we may return the wrong signed zero +;; so we check HONOR_SIGNED_ZEROS. 
+ +;; -a * b + c +(define_insn "fnma<mode>4" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (fma:ANYF (neg:ANYF (match_operand:ANYF 1 "register_operand" "f")) + (match_operand:ANYF 2 "register_operand" "f") + (match_operand:ANYF 3 "register_operand" "f")))] + "!HONOR_SIGNED_ZEROS (<MODE>mode)" + "fnmsub.<fmt>\t%0,%1,%2,%3" + [(set_attr "type" "fmadd") + (set_attr "mode" "<UNITMODE>")]) + +;; fnms is defined as: (fma (neg op1) op2 (neg op3)) +;; ((-op1) * op2) - op3 ==> -(op1 * op2) - op3 ==> -((op1 * op2) + op3) +;; The loongarch nmadd instructions implement -((op1 * op2) + op3) +;; This transformation means we may return the wrong signed zero +;; so we check HONOR_SIGNED_ZEROS. + +;; -a * b - c +(define_insn "fnms<mode>4" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (fma:ANYF + (neg:ANYF (match_operand:ANYF 1 "register_operand" "f")) + (match_operand:ANYF 2 "register_operand" "f") + (neg:ANYF (match_operand:ANYF 3 "register_operand" "f"))))] + "!HONOR_SIGNED_ZEROS (<MODE>mode)" + "fnmadd.<fmt>\t%0,%1,%2,%3" + [(set_attr "type" "fmadd") + (set_attr "mode" "<UNITMODE>")]) + +;; -(-a * b - c), modulo signed zeros +(define_insn "*fma<mode>4" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (neg:ANYF + (fma:ANYF + (neg:ANYF (match_operand:ANYF 1 "register_operand" " f")) + (match_operand:ANYF 2 "register_operand" " f") + (neg:ANYF (match_operand:ANYF 3 "register_operand" " f")))))] + "!HONOR_SIGNED_ZEROS (<MODE>mode)" + "fmadd.<fmt>\t%0,%1,%2,%3" + [(set_attr "type" "fmadd") + (set_attr "mode" "<UNITMODE>")]) + +;; -(-a * b + c), modulo signed zeros +(define_insn "*fms<mode>4" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (neg:ANYF + (fma:ANYF + (neg:ANYF (match_operand:ANYF 1 "register_operand" " f")) + (match_operand:ANYF 2 "register_operand" " f") + (match_operand:ANYF 3 "register_operand" " f"))))] + "!HONOR_SIGNED_ZEROS (<MODE>mode)" + "fmsub.<fmt>\t%0,%1,%2,%3" + [(set_attr "type" "fmadd") + (set_attr "mode" "<UNITMODE>")]) + +;; -(a * b + c) +(define_insn "*fnms<mode>4" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (neg:ANYF + (fma:ANYF + (match_operand:ANYF 1 "register_operand" " f") + (match_operand:ANYF 2 "register_operand" " f") + (match_operand:ANYF 3 "register_operand" " f"))))] + "" + "fnmadd.<fmt>\t%0,%1,%2,%3" + [(set_attr "type" "fmadd") + (set_attr "mode" "<UNITMODE>")]) + +;; -(a * b - c) +(define_insn "*fnma<mode>4" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (neg:ANYF + (fma:ANYF + (match_operand:ANYF 1 "register_operand" " f") + (match_operand:ANYF 2 "register_operand" " f") + (neg:ANYF (match_operand:ANYF 3 "register_operand" " f")))))] + "" + "fnmsub.<fmt>\t%0,%1,%2,%3" + [(set_attr "type" "fmadd") + (set_attr "mode" "<UNITMODE>")]) + +;; +;; .................... +;; +;; SQUARE ROOT +;; +;; .................... 
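The reciprocal square-root patterns in this section are guarded by flag_unsafe_math_optimizations, since folding the divide and the square root into one operation can change the rounded result. A hedged illustration of source that can reach them (typically only under -ffast-math, which enables -funsafe-math-optimizations):

  /* With -ffast-math this can become a single frsqrt.d; otherwise it stays
     an fsqrt.d followed by a division or reciprocal.  */
  double rsqrt (double x)
  {
    return 1.0 / __builtin_sqrt (x);
  }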
+ +(define_insn "sqrt<mode>2" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (sqrt:ANYF (match_operand:ANYF 1 "register_operand" "f")))] + "" + "fsqrt.<fmt>\t%0,%1" + [(set_attr "type" "fsqrt") + (set_attr "mode" "<UNITMODE>") + (set_attr "insn_count" "1")]) + +(define_insn "*rsqrt<mode>a" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (div:ANYF (match_operand:ANYF 1 "const_1_operand" "") + (sqrt:ANYF (match_operand:ANYF 2 "register_operand" "f"))))] + "flag_unsafe_math_optimizations" + "frsqrt.<fmt>\t%0,%2" + [(set_attr "type" "frsqrt") + (set_attr "mode" "<UNITMODE>") + (set_attr "insn_count" "1")]) + +(define_insn "*rsqrt<mode>b" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (sqrt:ANYF (div:ANYF (match_operand:ANYF 1 "const_1_operand" "") + (match_operand:ANYF 2 "register_operand" "f"))))] + "flag_unsafe_math_optimizations" + "frsqrt.<fmt>\t%0,%2" + [(set_attr "type" "frsqrt") + (set_attr "mode" "<UNITMODE>") + (set_attr "insn_count" "1")]) + +;; +;; .................... +;; +;; ABSOLUTE VALUE +;; +;; .................... + +(define_insn "abs<mode>2" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (abs:ANYF (match_operand:ANYF 1 "register_operand" "f")))] + "" + "fabs.<fmt>\t%0,%1" + [(set_attr "type" "fabs") + (set_attr "mode" "<UNITMODE>")]) + +;; +;; ................... +;; +;; Count leading zeroes. +;; +;; ................... +;; + +(define_insn "clz<mode>2" + [(set (match_operand:GPR 0 "register_operand" "=r") + (clz:GPR (match_operand:GPR 1 "register_operand" "r")))] + "" + "clz.<d>\t%0,%1" + [(set_attr "type" "clz") + (set_attr "mode" "<MODE>")]) + +;; +;; ................... +;; +;; Count trailing zeroes. +;; +;; ................... +;; + +(define_insn "ctz<mode>2" + [(set (match_operand:GPR 0 "register_operand" "=r") + (ctz:GPR (match_operand:GPR 1 "register_operand" "r")))] + "" + "ctz.<d>\t%0,%1" + [(set_attr "type" "clz") + (set_attr "mode" "<MODE>")]) + +;; +;; .................... +;; +;; MIN/MAX +;; +;; .................... + +(define_insn "smax<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (smax:ANYF (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f")))] + "" + "fmax.<fmt>\t%0,%1,%2" + [(set_attr "type" "fmove") + (set_attr "mode" "<MODE>")]) + +(define_insn "smin<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (smin:ANYF (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f")))] + "" + "fmin.<fmt>\t%0,%1,%2" + [(set_attr "type" "fmove") + (set_attr "mode" "<MODE>")]) + +(define_insn "smaxa<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (if_then_else:ANYF + (gt (abs:ANYF (match_operand:ANYF 1 "register_operand" "f")) + (abs:ANYF (match_operand:ANYF 2 "register_operand" "f"))) + (match_dup 1) + (match_dup 2)))] + "" + "fmaxa.<fmt>\t%0,%1,%2" + [(set_attr "type" "fmove") + (set_attr "mode" "<MODE>")]) + +(define_insn "smina<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (if_then_else:ANYF + (lt (abs:ANYF (match_operand:ANYF 1 "register_operand" "f")) + (abs:ANYF (match_operand:ANYF 2 "register_operand" "f"))) + (match_dup 1) + (match_dup 2)))] + "" + "fmina.<fmt>\t%0,%1,%2" + [(set_attr "type" "fmove") + (set_attr "mode" "<MODE>")]) + +;; +;; .................... +;; +;; NEGATION and ONE'S COMPLEMENT +;; +;; .................... 
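The integer patterns at the start of this section synthesise negation and bitwise NOT from three-register ALU instructions and the hard-wired zero register: negation is a subtract from $r0 and NOT is a NOR with $r0, as the templates below spell out. For instance (illustrative):

  long negate (long x) { return -x; }  /* sub.d  dst,$r0,x */
  long flip (long x)   { return ~x; }  /* nor    dst,$r0,x */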
+ +(define_insn "neg<mode>2" + [(set (match_operand:GPR 0 "register_operand" "=r") + (neg:GPR (match_operand:GPR 1 "register_operand" "r")))] + "" + "sub.<d>\t%0,%.,%1" + [(set_attr "alu_type" "sub") + (set_attr "mode" "<MODE>")]) + +(define_insn "one_cmpl<mode>2" + [(set (match_operand:GPR 0 "register_operand" "=r") + (not:GPR (match_operand:GPR 1 "register_operand" "r")))] + "" + "nor\t%0,%.,%1" + [(set_attr "alu_type" "not") + (set_attr "mode" "<MODE>")]) + +(define_insn "neg<mode>2" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (neg:ANYF (match_operand:ANYF 1 "register_operand" "f")))] + "" + "fneg.<fmt>\t%0,%1" + [(set_attr "type" "fneg") + (set_attr "mode" "<UNITMODE>")]) + + +;; +;; .................... +;; +;; LOGICAL +;; +;; .................... +;; + +(define_insn "<optab><mode>3" + [(set (match_operand:GPR 0 "register_operand" "=r,r") + (any_bitwise:GPR (match_operand:GPR 1 "register_operand" "%r,r") + (match_operand:GPR 2 "uns_arith_operand" "r,K")))] + "" + "<insn>%i2\t%0,%1,%2" + [(set_attr "type" "logical") + (set_attr "mode" "<MODE>")]) + +(define_insn "and<mode>3_extended" + [(set (match_operand:GPR 0 "register_operand" "=r") + (and:GPR (match_operand:GPR 1 "nonimmediate_operand" "r") + (match_operand:GPR 2 "low_bitmask_operand" "Yx")))] + "" +{ + int len; + + len = low_bitmask_len (<MODE>mode, INTVAL (operands[2])); + operands[2] = GEN_INT (len-1); + return "bstrpick.<d>\t%0,%1,%2,0"; +} + [(set_attr "move_type" "pick_ins") + (set_attr "mode" "<MODE>")]) + +(define_insn "*iorhi3" + [(set (match_operand:HI 0 "register_operand" "=r,r") + (ior:HI (match_operand:HI 1 "register_operand" "%r,r") + (match_operand:HI 2 "uns_arith_operand" "r,K")))] + "" + "or%i2\t%0,%1,%2" + [(set_attr "type" "logical") + (set_attr "mode" "HI")]) + +(define_insn "*nor<mode>3" + [(set (match_operand:GPR 0 "register_operand" "=r") + (and:GPR (not:GPR (match_operand:GPR 1 "register_operand" "%r")) + (not:GPR (match_operand:GPR 2 "register_operand" "r"))))] + "" + "nor\t%0,%1,%2" + [(set_attr "type" "logical") + (set_attr "mode" "<MODE>")]) + +(define_insn "<optab>n<mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (neg_bitwise:GPR + (not:GPR (match_operand:GPR 1 "register_operand" "r")) + (match_operand:GPR 2 "register_operand" "r")))] + "" + "<insn>n\t%0,%2,%1" + [(set_attr "type" "logical") + (set_attr "mode" "<MODE>")]) + + +;; +;; .................... +;; +;; TRUNCATION +;; +;; .................... + +(define_insn "truncdfsf2" + [(set (match_operand:SF 0 "register_operand" "=f") + (float_truncate:SF (match_operand:DF 1 "register_operand" "f")))] + "TARGET_DOUBLE_FLOAT" + "fcvt.s.d\t%0,%1" + [(set_attr "type" "fcvt") + (set_attr "cnv_mode" "D2S") + (set_attr "mode" "SF")]) + + +;; +;; .................... +;; +;; ZERO EXTENSION +;; +;; .................... 
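A small example of what the zero-extension patterns below produce (an illustration, assuming a 64-bit target): narrowing to 32 bits and widening back is a single bit-field pick rather than a shift pair, and zero-extending loads use the unsigned load forms:

  unsigned long zext (unsigned long x)
  {
    return (unsigned int) x;   /* bstrpick.d  dst,src,31,0 */
  }
  unsigned long load_u32 (unsigned int *p)
  {
    return *p;                 /* ld.wu  dst,p,0 */
  }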
+(define_expand "zero_extendsidi2" + [(set (match_operand:DI 0 "register_operand") + (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand")))] + "TARGET_64BIT") + +(define_insn_and_split "*zero_extendsidi2_internal" + [(set (match_operand:DI 0 "register_operand" "=r,r,r,r") + (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "r,m,ZC,k")))] + "TARGET_64BIT" + "@ + bstrpick.d\t%0,%1,31,0 + ld.wu\t%0,%1 + # + ldx.wu\t%0,%1" + "&& reload_completed + && MEM_P (operands[1]) + && (loongarch_14bit_shifted_offset_address_p (XEXP (operands[1], 0), SImode) + && !loongarch_12bit_offset_address_p (XEXP (operands[1], 0), SImode)) + && !paradoxical_subreg_p (operands[0])" + [(set (match_dup 3) (match_dup 1)) + (set (match_dup 0) + (ior:DI (zero_extend:DI + (subreg:SI (match_dup 0) 0)) + (match_dup 2)))] + { + operands[1] = gen_lowpart (SImode, operands[1]); + operands[3] = gen_lowpart (SImode, operands[0]); + operands[2] = const0_rtx; + } + [(set_attr "move_type" "arith,load,load,load") + (set_attr "mode" "DI")]) + +(define_insn "zero_extend<SHORT:mode><GPR:mode>2" + [(set (match_operand:GPR 0 "register_operand" "=r,r,r") + (zero_extend:GPR + (match_operand:SHORT 1 "nonimmediate_operand" "r,m,k")))] + "" + "@ + bstrpick.w\t%0,%1,<SHORT:7_or_15>,0 + ld.<SHORT:size>u\t%0,%1 + ldx.<SHORT:size>u\t%0,%1" + [(set_attr "move_type" "pick_ins,load,load") + (set_attr "mode" "<GPR:MODE>")]) + +(define_insn "zero_extendqihi2" + [(set (match_operand:HI 0 "register_operand" "=r,r,r") + (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" "r,k,m")))] + "" + "@ + andi\t%0,%1,0xff + ldx.bu\t%0,%1 + ld.bu\t%0,%1" + [(set_attr "move_type" "andi,load,load") + (set_attr "mode" "HI")]) + +;; Combiner patterns to optimize truncate/zero_extend combinations. + +(define_insn "*zero_extend<GPR:mode>_trunc<SHORT:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (zero_extend:GPR + (truncate:SHORT (match_operand:DI 1 "register_operand" "r"))))] + "TARGET_64BIT" + "bstrpick.w\t%0,%1,<SHORT:7_or_15>,0" + [(set_attr "move_type" "pick_ins") + (set_attr "mode" "<GPR:MODE>")]) + +(define_insn "*zero_extendhi_truncqi" + [(set (match_operand:HI 0 "register_operand" "=r") + (zero_extend:HI + (truncate:QI (match_operand:DI 1 "register_operand" "r"))))] + "TARGET_64BIT" + "andi\t%0,%1,0xff" + [(set_attr "alu_type" "and") + (set_attr "mode" "HI")]) + +;; +;; .................... +;; +;; SIGN EXTENSION +;; +;; .................... 
+
+(define_insn "extendsidi2"
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r,r")
+	(sign_extend:DI
+	    (match_operand:SI 1 "nonimmediate_operand" "r,ZC,m,k")))]
+  "TARGET_64BIT"
+  "@
+   slli.w\t%0,%1,0
+   ldptr.w\t%0,%1
+   ld.w\t%0,%1
+   ldx.w\t%0,%1"
+  [(set_attr "move_type" "sll0,load,load,load")
+   (set_attr "mode" "DI")])
+
+(define_insn "extend<SHORT:mode><GPR:mode>2"
+  [(set (match_operand:GPR 0 "register_operand" "=r,r,r")
+	(sign_extend:GPR
+	     (match_operand:SHORT 1 "nonimmediate_operand" "r,m,k")))]
+  ""
+  "@
+   ext.w.<SHORT:size>\t%0,%1
+   ld.<SHORT:size>\t%0,%1
+   ldx.<SHORT:size>\t%0,%1"
+  [(set_attr "move_type" "signext,load,load")
+   (set_attr "mode" "<GPR:MODE>")])
+
+(define_insn "extendqihi2"
+  [(set (match_operand:HI 0 "register_operand" "=r,r,r")
+	(sign_extend:HI
+	     (match_operand:QI 1 "nonimmediate_operand" "r,m,k")))]
+  ""
+  "@
+   ext.w.b\t%0,%1
+   ld.b\t%0,%1
+   ldx.b\t%0,%1"
+  [(set_attr "move_type" "signext,load,load")
+   (set_attr "mode" "SI")])
+
+(define_insn "extendsfdf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(float_extend:DF (match_operand:SF 1 "register_operand" "f")))]
+  "TARGET_DOUBLE_FLOAT"
+  "fcvt.d.s\t%0,%1"
+  [(set_attr "type" "fcvt")
+   (set_attr "cnv_mode" "S2D")
+   (set_attr "mode" "DF")])
+
+;;
+;;  ....................
+;;
+;;	CONVERSIONS
+;;
+;;  ....................
+
+;; conversion of a floating-point value to an integer
+
+(define_insn "fix_trunc<ANYF:mode><GPR:mode>2"
+  [(set (match_operand:GPR 0 "register_operand" "=f")
+	(fix:GPR (match_operand:ANYF 1 "register_operand" "f")))]
+  ""
+  "ftintrz.<GPR:ifmt>.<ANYF:fmt> %0,%1"
+  [(set_attr "type" "fcvt")
+   (set_attr "mode" "<ANYF:MODE>")])
+
+;; conversion of an integral (or boolean) value to a floating-point value
+
+(define_insn "floatsidf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(float:DF (match_operand:SI 1 "register_operand" "f")))]
+  "TARGET_DOUBLE_FLOAT"
+  "ffint.d.w\t%0,%1"
+  [(set_attr "type" "fcvt")
+   (set_attr "mode" "DF")
+   (set_attr "cnv_mode" "I2D")])
+
+(define_insn "floatdidf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(float:DF (match_operand:DI 1 "register_operand" "f")))]
+  "TARGET_DOUBLE_FLOAT"
+  "ffint.d.l\t%0,%1"
+  [(set_attr "type" "fcvt")
+   (set_attr "mode" "DF")
+   (set_attr "cnv_mode" "I2D")])
+
+(define_insn "floatsisf2"
+  [(set (match_operand:SF 0 "register_operand" "=f")
+	(float:SF (match_operand:SI 1 "register_operand" "f")))]
+  "TARGET_HARD_FLOAT"
+  "ffint.s.w\t%0,%1"
+  [(set_attr "type" "fcvt")
+   (set_attr "mode" "SF")
+   (set_attr "cnv_mode" "I2S")])
+
+(define_insn "floatdisf2"
+  [(set (match_operand:SF 0 "register_operand" "=f")
+	(float:SF (match_operand:DI 1 "register_operand" "f")))]
+  "TARGET_DOUBLE_FLOAT"
+  "ffint.s.l\t%0,%1"
+  [(set_attr "type" "fcvt")
+   (set_attr "mode" "SF")
+   (set_attr "cnv_mode" "I2S")])
+
+;; Convert a floating-point value to an unsigned integer.
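The expanders below implement the unsigned conversions in terms of the signed fix_trunc patterns above: values below 2^31 (or 2^63) are converted directly, while larger values are reduced by 2^31 (or 2^63) before conversion and the corresponding high bit is OR'd back in afterwards. A rough C model of fixuns_truncdfsi2 (illustrative only; it assumes the input is in range for the unsigned result):

  unsigned int fixuns_dfsi (double x)
  {
    if (x < 2147483648.0)      /* 2^31: plain ftintrz.w.d */
      return (int) x;
    /* Otherwise convert x - 2^31 and OR the high bit back in.  */
    return (unsigned int) (int) (x - 2147483648.0) | 0x80000000u;
  }

The DImode and SFmode variants follow the same shape with 2^63 and the corresponding single-precision compare.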
+ +(define_expand "fixuns_truncdfsi2" + [(set (match_operand:SI 0 "register_operand") + (unsigned_fix:SI (match_operand:DF 1 "register_operand")))] + "TARGET_DOUBLE_FLOAT" +{ + rtx reg1 = gen_reg_rtx (DFmode); + rtx reg2 = gen_reg_rtx (DFmode); + rtx reg3 = gen_reg_rtx (SImode); + rtx_code_label *label1 = gen_label_rtx (); + rtx_code_label *label2 = gen_label_rtx (); + rtx test; + REAL_VALUE_TYPE offset; + + real_2expN (&offset, 31, DFmode); + + loongarch_emit_move (reg1, + const_double_from_real_value (offset, DFmode)); + do_pending_stack_adjust (); + + test = gen_rtx_GE (VOIDmode, operands[1], reg1); + emit_jump_insn (gen_cbranchdf4 (test, operands[1], reg1, label1)); + + emit_insn (gen_fix_truncdfsi2 (operands[0], operands[1])); + emit_jump_insn (gen_rtx_SET (pc_rtx, + gen_rtx_LABEL_REF (VOIDmode, label2))); + emit_barrier (); + + emit_label (label1); + loongarch_emit_move (reg2, gen_rtx_MINUS (DFmode, operands[1], reg1)); + loongarch_emit_move (reg3, GEN_INT (trunc_int_for_mode + (BITMASK_HIGH, SImode))); + + emit_insn (gen_fix_truncdfsi2 (operands[0], reg2)); + emit_insn (gen_iorsi3 (operands[0], operands[0], reg3)); + + emit_label (label2); + + /* Allow REG_NOTES to be set on last insn (labels don't have enough + fields, and can't be used for REG_NOTES anyway). */ + emit_use (stack_pointer_rtx); + DONE; +}) + +(define_expand "fixuns_truncdfdi2" + [(set (match_operand:DI 0 "register_operand") + (unsigned_fix:DI (match_operand:DF 1 "register_operand")))] + "TARGET_DOUBLE_FLOAT" +{ + rtx reg1 = gen_reg_rtx (DFmode); + rtx reg2 = gen_reg_rtx (DFmode); + rtx reg3 = gen_reg_rtx (DImode); + rtx_code_label *label1 = gen_label_rtx (); + rtx_code_label *label2 = gen_label_rtx (); + rtx test; + REAL_VALUE_TYPE offset; + + real_2expN (&offset, 63, DFmode); + + loongarch_emit_move (reg1, const_double_from_real_value (offset, DFmode)); + do_pending_stack_adjust (); + + test = gen_rtx_GE (VOIDmode, operands[1], reg1); + emit_jump_insn (gen_cbranchdf4 (test, operands[1], reg1, label1)); + + emit_insn (gen_fix_truncdfdi2 (operands[0], operands[1])); + emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (VOIDmode, label2))); + emit_barrier (); + + emit_label (label1); + loongarch_emit_move (reg2, gen_rtx_MINUS (DFmode, operands[1], reg1)); + loongarch_emit_move (reg3, GEN_INT (BITMASK_HIGH)); + emit_insn (gen_ashldi3 (reg3, reg3, GEN_INT (32))); + + emit_insn (gen_fix_truncdfdi2 (operands[0], reg2)); + emit_insn (gen_iordi3 (operands[0], operands[0], reg3)); + + emit_label (label2); + + /* Allow REG_NOTES to be set on last insn (labels don't have enough + fields, and can't be used for REG_NOTES anyway). 
*/ + emit_use (stack_pointer_rtx); + DONE; +}) + +(define_expand "fixuns_truncsfsi2" + [(set (match_operand:SI 0 "register_operand") + (unsigned_fix:SI (match_operand:SF 1 "register_operand")))] + "TARGET_HARD_FLOAT" +{ + rtx reg1 = gen_reg_rtx (SFmode); + rtx reg2 = gen_reg_rtx (SFmode); + rtx reg3 = gen_reg_rtx (SImode); + rtx_code_label *label1 = gen_label_rtx (); + rtx_code_label *label2 = gen_label_rtx (); + rtx test; + REAL_VALUE_TYPE offset; + + real_2expN (&offset, 31, SFmode); + + loongarch_emit_move (reg1, const_double_from_real_value (offset, SFmode)); + do_pending_stack_adjust (); + + test = gen_rtx_GE (VOIDmode, operands[1], reg1); + emit_jump_insn (gen_cbranchsf4 (test, operands[1], reg1, label1)); + + emit_insn (gen_fix_truncsfsi2 (operands[0], operands[1])); + emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (VOIDmode, label2))); + emit_barrier (); + + emit_label (label1); + loongarch_emit_move (reg2, gen_rtx_MINUS (SFmode, operands[1], reg1)); + loongarch_emit_move (reg3, GEN_INT (trunc_int_for_mode + (BITMASK_HIGH, SImode))); + + emit_insn (gen_fix_truncsfsi2 (operands[0], reg2)); + emit_insn (gen_iorsi3 (operands[0], operands[0], reg3)); + + emit_label (label2); + + /* Allow REG_NOTES to be set on last insn (labels don't have enough + fields, and can't be used for REG_NOTES anyway). */ + emit_use (stack_pointer_rtx); + DONE; +}) + +(define_expand "fixuns_truncsfdi2" + [(set (match_operand:DI 0 "register_operand") + (unsigned_fix:DI (match_operand:SF 1 "register_operand")))] + "TARGET_DOUBLE_FLOAT" +{ + rtx reg1 = gen_reg_rtx (SFmode); + rtx reg2 = gen_reg_rtx (SFmode); + rtx reg3 = gen_reg_rtx (DImode); + rtx_code_label *label1 = gen_label_rtx (); + rtx_code_label *label2 = gen_label_rtx (); + rtx test; + REAL_VALUE_TYPE offset; + + real_2expN (&offset, 63, SFmode); + + loongarch_emit_move (reg1, const_double_from_real_value (offset, SFmode)); + do_pending_stack_adjust (); + + test = gen_rtx_GE (VOIDmode, operands[1], reg1); + emit_jump_insn (gen_cbranchsf4 (test, operands[1], reg1, label1)); + + emit_insn (gen_fix_truncsfdi2 (operands[0], operands[1])); + emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (VOIDmode, label2))); + emit_barrier (); + + emit_label (label1); + loongarch_emit_move (reg2, gen_rtx_MINUS (SFmode, operands[1], reg1)); + loongarch_emit_move (reg3, GEN_INT (BITMASK_HIGH)); + emit_insn (gen_ashldi3 (reg3, reg3, GEN_INT (32))); + + emit_insn (gen_fix_truncsfdi2 (operands[0], reg2)); + emit_insn (gen_iordi3 (operands[0], operands[0], reg3)); + + emit_label (label2); + + /* Allow REG_NOTES to be set on last insn (labels don't have enough + fields, and can't be used for REG_NOTES anyway). */ + emit_use (stack_pointer_rtx); + DONE; +}) + +;; +;; .................... +;; +;; EXTRACT AND INSERT +;; +;; .................... 
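The extract and insert patterns that follow map GCC's zero_extract RTL onto the bit-string instructions. A hedged illustration of the kind of source that can reach them (whether the insertion form is actually used depends on loongarch_use_ins_ext_p and on what combine builds):

  unsigned long get_field (unsigned long x)
  {
    return (x >> 16) & 0xff;                        /* bstrpick.d  dst,x,23,16 */
  }
  unsigned long set_field (unsigned long x, unsigned long v)
  {
    return (x & ~0xff0000UL) | ((v & 0xff) << 16);  /* may become bstrins.d  x,v,23,16 */
  }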
+ +(define_expand "extzv<mode>" + [(set (match_operand:X 0 "register_operand") + (zero_extract:X (match_operand:X 1 "register_operand") + (match_operand 2 "const_int_operand") + (match_operand 3 "const_int_operand")))] + "" +{ + if (!loongarch_use_ins_ext_p (operands[1], INTVAL (operands[2]), + INTVAL (operands[3]))) + FAIL; +}) + +(define_insn "*extzv<mode>" + [(set (match_operand:X 0 "register_operand" "=r") + (zero_extract:X (match_operand:X 1 "register_operand" "r") + (match_operand 2 "const_int_operand" "") + (match_operand 3 "const_int_operand" "")))] + "loongarch_use_ins_ext_p (operands[1], INTVAL (operands[2]), + INTVAL (operands[3]))" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) + INTVAL (operands[3]) - 1); + return "bstrpick.<d>\t%0,%1,%2,%3"; +} + [(set_attr "type" "arith") + (set_attr "mode" "<MODE>")]) + +(define_expand "insv<mode>" + [(set (zero_extract:GPR (match_operand:GPR 0 "register_operand") + (match_operand 1 "const_int_operand") + (match_operand 2 "const_int_operand")) + (match_operand:GPR 3 "reg_or_0_operand"))] + "" +{ + if (!loongarch_use_ins_ext_p (operands[0], INTVAL (operands[1]), + INTVAL (operands[2]))) + FAIL; +}) + +(define_insn "*insv<mode>" + [(set (zero_extract:GPR (match_operand:GPR 0 "register_operand" "+r") + (match_operand:SI 1 "const_int_operand" "") + (match_operand:SI 2 "const_int_operand" "")) + (match_operand:GPR 3 "reg_or_0_operand" "rJ"))] + "loongarch_use_ins_ext_p (operands[0], INTVAL (operands[1]), + INTVAL (operands[2]))" +{ + operands[1] = GEN_INT (INTVAL (operands[1]) + INTVAL (operands[2]) - 1); + return "bstrins.<d>\t%0,%z3,%1,%2"; +} + [(set_attr "type" "arith") + (set_attr "mode" "<MODE>")]) + +;; +;; .................... +;; +;; DATA MOVEMENT +;; +;; .................... + +;; 64-bit integer moves + +;; Unlike most other insns, the move insns can't be split with +;; different predicates, because register spilling and other parts of +;; the compiler, have memoized the insn number already. 
+ +(define_expand "movdi" + [(set (match_operand:DI 0 "") + (match_operand:DI 1 ""))] + "" +{ + if (loongarch_legitimize_move (DImode, operands[0], operands[1])) + DONE; +}) + +(define_insn "*movdi_32bit" + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r,w,*f,*f,*r,*m") + (match_operand:DI 1 "move_operand" "r,i,w,r,*J*r,*m,*f,*f"))] + "!TARGET_64BIT + && (register_operand (operands[0], DImode) + || reg_or_0_operand (operands[1], DImode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "move,const,load,store,mgtf,fpload,mftg,fpstore") + (set_attr "mode" "DI")]) + +(define_insn "*movdi_64bit" + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r,w,*f,*f,*r,*m") + (match_operand:DI 1 "move_operand" "r,Yd,w,rJ,*r*J,*m,*f,*f"))] + "TARGET_64BIT + && (register_operand (operands[0], DImode) + || reg_or_0_operand (operands[1], DImode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "move,const,load,store,mgtf,fpload,mftg,fpstore") + (set_attr "mode" "DI")]) + +;; 32-bit Integer moves + +(define_expand "movsi" + [(set (match_operand:SI 0 "") + (match_operand:SI 1 ""))] + "" +{ + if (loongarch_legitimize_move (SImode, operands[0], operands[1])) + DONE; +}) + +(define_insn "*movsi_internal" + [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r,w,*f,*f,*r,*m,*r,*z") + (match_operand:SI 1 "move_operand" "r,Yd,w,rJ,*r*J,*m,*f,*f,*z,*r"))] + "(register_operand (operands[0], SImode) + || reg_or_0_operand (operands[1], SImode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "move,const,load,store,mgtf,fpload,mftg,fpstore,mftg,mgtf") + (set_attr "mode" "SI")]) + +;; 16-bit Integer moves + +;; Unlike most other insns, the move insns can't be split with +;; different predicates, because register spilling and other parts of +;; the compiler, have memoized the insn number already. +;; Unsigned loads are used because LOAD_EXTEND_OP returns ZERO_EXTEND. + +(define_expand "movhi" + [(set (match_operand:HI 0 "") + (match_operand:HI 1 ""))] + "" +{ + if (loongarch_legitimize_move (HImode, operands[0], operands[1])) + DONE; +}) + +(define_insn "*movhi_internal" + [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,r,m,r,k") + (match_operand:HI 1 "move_operand" "r,Yd,I,m,rJ,k,rJ"))] + "(register_operand (operands[0], HImode) + || reg_or_0_operand (operands[1], HImode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "move,const,const,load,store,load,store") + (set_attr "mode" "HI")]) + +;; 8-bit Integer moves + +;; Unlike most other insns, the move insns can't be split with +;; different predicates, because register spilling and other parts of +;; the compiler, have memoized the insn number already. +;; Unsigned loads are used because LOAD_EXTEND_OP returns ZERO_EXTEND. 
+ +(define_expand "movqi" + [(set (match_operand:QI 0 "") + (match_operand:QI 1 ""))] + "" +{ + if (loongarch_legitimize_move (QImode, operands[0], operands[1])) + DONE; +}) + +(define_insn "*movqi_internal" + [(set (match_operand:QI 0 "nonimmediate_operand" "=r,r,r,m,r,k") + (match_operand:QI 1 "move_operand" "r,I,m,rJ,k,rJ"))] + "(register_operand (operands[0], QImode) + || reg_or_0_operand (operands[1], QImode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "move,const,load,store,load,store") + (set_attr "mode" "QI")]) + +;; 32-bit floating point moves + +(define_expand "movsf" + [(set (match_operand:SF 0 "") + (match_operand:SF 1 ""))] + "" +{ + if (loongarch_legitimize_move (SFmode, operands[0], operands[1])) + DONE; +}) + +(define_insn "*movsf_hardfloat" + [(set (match_operand:SF 0 "nonimmediate_operand" "=f,f,f,m,f,k,m,*f,*r,*r,*r,*m") + (match_operand:SF 1 "move_operand" "f,G,m,f,k,f,G,*r,*f,*G*r,*m,*r"))] + "TARGET_HARD_FLOAT + && (register_operand (operands[0], SFmode) + || reg_or_0_operand (operands[1], SFmode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "fmove,mgtf,fpload,fpstore,fpload,fpstore,store,mgtf,mftg,move,load,store") + (set_attr "mode" "SF")]) + +(define_insn "*movsf_softfloat" + [(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,m") + (match_operand:SF 1 "move_operand" "Gr,m,r"))] + "TARGET_SOFT_FLOAT + && (register_operand (operands[0], SFmode) + || reg_or_0_operand (operands[1], SFmode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "move,load,store") + (set_attr "mode" "SF")]) + +;; 64-bit floating point moves + +(define_expand "movdf" + [(set (match_operand:DF 0 "") + (match_operand:DF 1 ""))] + "" +{ + if (loongarch_legitimize_move (DFmode, operands[0], operands[1])) + DONE; +}) + +(define_insn "*movdf_hardfloat" + [(set (match_operand:DF 0 "nonimmediate_operand" "=f,f,f,m,f,k,m,*f,*r,*r,*r,*m") + (match_operand:DF 1 "move_operand" "f,G,m,f,k,f,G,*r,*f,*r*G,*m,*r"))] + "TARGET_DOUBLE_FLOAT + && (register_operand (operands[0], DFmode) + || reg_or_0_operand (operands[1], DFmode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "fmove,mgtf,fpload,fpstore,fpload,fpstore,store,mgtf,mftg,move,load,store") + (set_attr "mode" "DF")]) + +(define_insn "*movdf_softfloat" + [(set (match_operand:DF 0 "nonimmediate_operand" "=r,r,m") + (match_operand:DF 1 "move_operand" "rG,m,rG"))] + "(TARGET_SOFT_FLOAT || TARGET_SINGLE_FLOAT) + && (register_operand (operands[0], DFmode) + || reg_or_0_operand (operands[1], DFmode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "move,load,store") + (set_attr "mode" "DF")]) + + +;; 128-bit integer moves + +(define_expand "movti" + [(set (match_operand:TI 0) + (match_operand:TI 1))] + "TARGET_64BIT" +{ + if (loongarch_legitimize_move (TImode, operands[0], operands[1])) + DONE; +}) + +(define_insn "*movti" + [(set (match_operand:TI 0 "nonimmediate_operand" "=r,r,r,m") + (match_operand:TI 1 "move_operand" "r,i,m,rJ"))] + "TARGET_64BIT + && (register_operand (operands[0], TImode) + || reg_or_0_operand (operands[1], TImode))" + { return loongarch_output_move (operands[0], operands[1]); } + [(set_attr "move_type" "move,const,load,store") + (set (attr "mode") + (if_then_else (eq_attr "move_type" "imul") + (const_string "SI") + (const_string "TI")))]) + +;; 128-bit floating point moves + +(define_expand "movtf" + [(set 
(match_operand:TF 0) + (match_operand:TF 1))] + "TARGET_64BIT" +{ + if (loongarch_legitimize_move (TFmode, operands[0], operands[1])) + DONE; +}) + +;; This pattern handles both hard- and soft-float cases. +(define_insn "*movtf" + [(set (match_operand:TF 0 "nonimmediate_operand" "=r,r,m,f,r,f,m") + (match_operand:TF 1 "move_operand" "rG,m,rG,rG,f,m,f"))] + "TARGET_64BIT + && (register_operand (operands[0], TFmode) + || reg_or_0_operand (operands[1], TFmode))" + "#" + [(set_attr "move_type" "move,load,store,mgtf,mftg,fpload,fpstore") + (set_attr "mode" "TF")]) + +(define_split + [(set (match_operand:MOVE64 0 "nonimmediate_operand") + (match_operand:MOVE64 1 "move_operand"))] + "reload_completed && loongarch_split_move_insn_p (operands[0], operands[1])" + [(const_int 0)] +{ + loongarch_split_move_insn (operands[0], operands[1], curr_insn); + DONE; +}) + +(define_split + [(set (match_operand:MOVE128 0 "nonimmediate_operand") + (match_operand:MOVE128 1 "move_operand"))] + "reload_completed && loongarch_split_move_insn_p (operands[0], operands[1])" + [(const_int 0)] +{ + loongarch_split_move_insn (operands[0], operands[1], curr_insn); + DONE; +}) + +;; Emit a doubleword move in which exactly one of the operands is +;; a floating-point register. We can't just emit two normal moves +;; because of the constraints imposed by the FPU register model; +;; see loongarch_can_change_mode_class for details. Instead, we keep +;; the FPR whole and use special patterns to refer to each word of +;; the other operand. + +(define_expand "move_doubleword_fpr<mode>" + [(set (match_operand:SPLITF 0) + (match_operand:SPLITF 1))] + "" +{ + if (FP_REG_RTX_P (operands[0])) + { + rtx low = loongarch_subword (operands[1], 0); + rtx high = loongarch_subword (operands[1], 1); + emit_insn (gen_load_low<mode> (operands[0], low)); + if (!TARGET_64BIT) + emit_insn (gen_movgr2frh<mode> (operands[0], high, operands[0])); + else + emit_insn (gen_load_high<mode> (operands[0], high, operands[0])); + } + else + { + rtx low = loongarch_subword (operands[0], 0); + rtx high = loongarch_subword (operands[0], 1); + emit_insn (gen_store_word<mode> (low, operands[1], const0_rtx)); + if (!TARGET_64BIT) + emit_insn (gen_movfrh2gr<mode> (high, operands[1])); + else + emit_insn (gen_store_word<mode> (high, operands[1], const1_rtx)); + } + DONE; +}) + +;; Clear one FCC register + +(define_insn "movfcc" + [(set (match_operand:FCC 0 "register_operand" "=z") + (const_int 0))] + "" + "movgr2cf\t%0,$r0") + +;; Conditional move instructions. + +(define_insn "*sel<code><GPR:mode>_using_<GPR2:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r,r") + (if_then_else:GPR + (equality_op:GPR2 (match_operand:GPR2 1 "register_operand" "r,r") + (const_int 0)) + (match_operand:GPR 2 "reg_or_0_operand" "r,J") + (match_operand:GPR 3 "reg_or_0_operand" "J,r")))] + "register_operand (operands[2], <GPR:MODE>mode) + != register_operand (operands[3], <GPR:MODE>mode)" + "@ + <sel>\t%0,%2,%1 + <selinv>\t%0,%3,%1" + [(set_attr "type" "condmove") + (set_attr "mode" "<GPR:MODE>")]) + +;; fsel copies the 3rd argument when the 1st is non-zero and the 2nd +;; argument if the 1st is zero. This means operand 2 and 3 are +;; inverted in the instruction. 
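To make the operand swap concrete (an illustrative mapping, not an excerpt): for RTL of the form (if_then_else (ne FCC 0) A B), the template below prints "fsel %0,%3,%2,%1", i.e. fsel dst,B,A,cc, because fsel takes the else-value first and the then-value second. Typical source is a floating-point conditional expression:

  double pick (double a, double b, double x, double y)
  {
    return x < y ? a : b;   /* roughly: fcmp.slt.d $fcc0,x,y ; fsel dst,b,a,$fcc0 */
  }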
+ +(define_insn "*sel<mode>" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (if_then_else:ANYF + (ne:FCC (match_operand:FCC 1 "register_operand" "z") + (const_int 0)) + (match_operand:ANYF 2 "reg_or_0_operand" "f") + (match_operand:ANYF 3 "reg_or_0_operand" "f")))] + "" + "fsel\t%0,%3,%2,%1" + [(set_attr "type" "condmove") + (set_attr "mode" "<ANYF:MODE>")]) + +;; These are the main define_expand's used to make conditional moves. + +(define_expand "mov<mode>cc" + [(set (match_operand:GPR 0 "register_operand") + (if_then_else:GPR (match_operator 1 "comparison_operator" + [(match_operand:GPR 2 "reg_or_0_operand") + (match_operand:GPR 3 "reg_or_0_operand")])))] + "TARGET_COND_MOVE_INT" +{ + if (!INTEGRAL_MODE_P (GET_MODE (XEXP (operands[1], 0)))) + FAIL; + + loongarch_expand_conditional_move (operands); + DONE; +}) + +(define_expand "mov<mode>cc" + [(set (match_operand:ANYF 0 "register_operand") + (if_then_else:ANYF (match_operator 1 "comparison_operator" + [(match_operand:ANYF 2 "reg_or_0_operand") + (match_operand:ANYF 3 "reg_or_0_operand")])))] + "TARGET_COND_MOVE_FLOAT" +{ + if (!FLOAT_MODE_P (GET_MODE (XEXP (operands[1], 0)))) + FAIL; + + loongarch_expand_conditional_move (operands); + DONE; +}) + +(define_insn "lu32i_d" + [(set (match_operand:DI 0 "register_operand" "=r") + (ior:DI + (zero_extend:DI + (subreg:SI (match_operand:DI 1 "register_operand" "0") 0)) + (match_operand:DI 2 "const_lu32i_operand" "u")))] + "TARGET_64BIT" + "lu32i.d\t%0,%X2>>32" + [(set_attr "type" "arith") + (set_attr "mode" "DI")]) + +(define_insn "lu52i_d" + [(set (match_operand:DI 0 "register_operand" "=r") + (ior:DI + (and:DI (match_operand:DI 1 "register_operand" "r") + (match_operand 2 "lu52i_mask_operand")) + (match_operand 3 "const_lu52i_operand" "v")))] + "TARGET_64BIT" + "lu52i.d\t%0,%1,%X3>>52" + [(set_attr "type" "arith") + (set_attr "mode" "DI")]) + +;; Convert floating-point numbers to integers +(define_insn "frint_<fmt>" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")] + UNSPEC_FRINT))] + "" + "frint.<fmt>\t%0,%1" + [(set_attr "type" "fcvt") + (set_attr "mode" "<MODE>")]) + +;; Load the low word of operand 0 with operand 1. +(define_insn "load_low<mode>" + [(set (match_operand:SPLITF 0 "register_operand" "=f,f") + (unspec:SPLITF [(match_operand:<HALFMODE> 1 "general_operand" "rJ,m")] + UNSPEC_LOAD_LOW))] + "TARGET_HARD_FLOAT" +{ + operands[0] = loongarch_subword (operands[0], 0); + return loongarch_output_move (operands[0], operands[1]); +} + [(set_attr "move_type" "mgtf,fpload") + (set_attr "mode" "<HALFMODE>")]) + +;; Load the high word of operand 0 from operand 1, preserving the value +;; in the low word. +(define_insn "load_high<mode>" + [(set (match_operand:SPLITF 0 "register_operand" "=f,f") + (unspec:SPLITF [(match_operand:<HALFMODE> 1 "general_operand" "rJ,m") + (match_operand:SPLITF 2 "register_operand" "0,0")] + UNSPEC_LOAD_HIGH))] + "TARGET_HARD_FLOAT" +{ + operands[0] = loongarch_subword (operands[0], 1); + return loongarch_output_move (operands[0], operands[1]); +} + [(set_attr "move_type" "mgtf,fpload") + (set_attr "mode" "<HALFMODE>")]) + +;; Store one word of operand 1 in operand 0. Operand 2 is 1 to store the +;; high word and 0 to store the low word. 
+(define_insn "store_word<mode>" + [(set (match_operand:<HALFMODE> 0 "nonimmediate_operand" "=r,m") + (unspec:<HALFMODE> [(match_operand:SPLITF 1 "register_operand" "f,f") + (match_operand 2 "const_int_operand")] + UNSPEC_STORE_WORD))] + "TARGET_HARD_FLOAT" +{ + operands[1] = loongarch_subword (operands[1], INTVAL (operands[2])); + return loongarch_output_move (operands[0], operands[1]); +} + [(set_attr "move_type" "mftg,fpstore") + (set_attr "mode" "<HALFMODE>")]) + +;; Thread-Local Storage + +(define_insn "@got_load_tls_gd<mode>" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P + [(match_operand:P 1 "symbolic_operand" "")] + UNSPEC_TLS_GD))] + "" + "la.tls.gd\t%0,%1" + [(set_attr "got" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "@got_load_tls_ld<mode>" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P + [(match_operand:P 1 "symbolic_operand" "")] + UNSPEC_TLS_LD))] + "" + "la.tls.ld\t%0,%1" + [(set_attr "got" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "@got_load_tls_le<mode>" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P + [(match_operand:P 1 "symbolic_operand" "")] + UNSPEC_TLS_LE))] + "" + "la.tls.le\t%0,%1" + [(set_attr "got" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "@got_load_tls_ie<mode>" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P + [(match_operand:P 1 "symbolic_operand" "")] + UNSPEC_TLS_IE))] + "" + "la.tls.ie\t%0,%1" + [(set_attr "got" "load") + (set_attr "mode" "<MODE>")]) + +;; Move operand 1 to the high word of operand 0 using movgr2frh.w, preserving the +;; value in the low word. +(define_insn "movgr2frh<mode>" + [(set (match_operand:SPLITF 0 "register_operand" "=f") + (unspec:SPLITF [(match_operand:<HALFMODE> 1 "reg_or_0_operand" "rJ") + (match_operand:SPLITF 2 "register_operand" "0")] + UNSPEC_MOVGR2FRH))] + "TARGET_DOUBLE_FLOAT" + "movgr2frh.w\t%z1,%0" + [(set_attr "move_type" "mgtf") + (set_attr "mode" "<HALFMODE>")]) + +;; Move high word of operand 1 to operand 0 using movfrh2gr.s. +(define_insn "movfrh2gr<mode>" + [(set (match_operand:<HALFMODE> 0 "register_operand" "=r") + (unspec:<HALFMODE> [(match_operand:SPLITF 1 "register_operand" "f")] + UNSPEC_MOVFRH2GR))] + "TARGET_DOUBLE_FLOAT" + "movfrh2gr.s\t%0,%1" + [(set_attr "move_type" "mftg") + (set_attr "mode" "<HALFMODE>")]) + + +;; Expand in-line code to clear the instruction cache between operand[0] and +;; operand[1]. 
+(define_expand "clear_cache" + [(match_operand 0 "pmode_register_operand") + (match_operand 1 "pmode_register_operand")] + "" +{ + emit_insn (gen_loongarch_ibar (const0_rtx)); + DONE; +}) + +(define_insn "loongarch_ibar" + [(unspec_volatile:SI + [(match_operand 0 "const_uimm15_operand")] + UNSPECV_IBAR) + (clobber (mem:BLK (scratch)))] + "" + "ibar\t%0") + +(define_insn "loongarch_dbar" + [(unspec_volatile:SI + [(match_operand 0 "const_uimm15_operand")] + UNSPECV_DBAR) + (clobber (mem:BLK (scratch)))] + "" + "dbar\t%0") + + + +;; Privileged state instruction + +(define_insn "loongarch_cpucfg" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec_volatile:SI [(match_operand:SI 1 "register_operand" "r")] + UNSPECV_CPUCFG))] + "" + "cpucfg\t%0,%1" + [(set_attr "type" "load") + (set_attr "mode" "SI")]) + +(define_insn "loongarch_syscall" + [(unspec_volatile:SI + [(match_operand 0 "const_uimm15_operand")] + UNSPECV_SYSCALL) + (clobber (mem:BLK (scratch)))] + "" + "syscall\t%0") + +(define_insn "loongarch_break" + [(unspec_volatile:SI + [(match_operand 0 "const_uimm15_operand")] + UNSPECV_BREAK) + (clobber (mem:BLK (scratch)))] + "" + "break\t%0") + +(define_insn "loongarch_asrtle_d" + [(unspec_volatile:DI [(match_operand:DI 0 "register_operand" "r") + (match_operand:DI 1 "register_operand" "r")] + UNSPECV_ASRTLE_D)] + "TARGET_64BIT" + "asrtle.d\t%0,%1" + [(set_attr "type" "load") + (set_attr "mode" "DI")]) + +(define_insn "loongarch_asrtgt_d" + [(unspec_volatile:DI [(match_operand:DI 0 "register_operand" "r") + (match_operand:DI 1 "register_operand" "r")] + UNSPECV_ASRTGT_D)] + "TARGET_64BIT" + "asrtgt.d\t%0,%1" + [(set_attr "type" "load") + (set_attr "mode" "DI")]) + +(define_insn "loongarch_csrrd_<d>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (unspec_volatile:GPR [(match_operand 1 "const_uimm14_operand")] + UNSPECV_CSRRD)) + (clobber (mem:BLK (scratch)))] + "" + "csrrd\t%0,%1" + [(set_attr "type" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "loongarch_csrwr_<d>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (unspec_volatile:GPR + [(match_operand:GPR 1 "register_operand" "0") + (match_operand 2 "const_uimm14_operand")] + UNSPECV_CSRWR)) + (clobber (mem:BLK (scratch)))] + "" + "csrwr\t%0,%2" + [(set_attr "type" "store") + (set_attr "mode" "<MODE>")]) + +(define_insn "loongarch_csrxchg_<d>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (unspec_volatile:GPR + [(match_operand:GPR 1 "register_operand" "0") + (match_operand:GPR 2 "register_operand" "q") + (match_operand 3 "const_uimm14_operand")] + UNSPECV_CSRXCHG)) + (clobber (mem:BLK (scratch)))] + "" + "csrxchg\t%0,%2,%3" + [(set_attr "type" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "loongarch_iocsrrd_<size>" + [(set (match_operand:QHWD 0 "register_operand" "=r") + (unspec_volatile:QHWD [(match_operand:SI 1 "register_operand" "r")] + UNSPECV_IOCSRRD)) + (clobber (mem:BLK (scratch)))] + "" + "iocsrrd.<size>\t%0,%1" + [(set_attr "type" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "loongarch_iocsrwr_<size>" + [(unspec_volatile:QHWD [(match_operand:QHWD 0 "register_operand" "r") + (match_operand:SI 1 "register_operand" "r")] + UNSPECV_IOCSRWR) + (clobber (mem:BLK (scratch)))] + "" + "iocsrwr.<size>\t%0,%1" + [(set_attr "type" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "loongarch_cacop_<d>" + [(unspec_volatile:X [(match_operand 0 "const_uimm5_operand") + (match_operand:X 1 "register_operand" "r") + (match_operand 2 "const_imm12_operand")] + UNSPECV_CACOP) + 
(clobber (mem:BLK (scratch)))] + "" + "cacop\t%0,%1,%2" + [(set_attr "type" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "loongarch_lddir_<d>" + [(unspec_volatile:X [(match_operand:X 0 "register_operand" "r") + (match_operand:X 1 "register_operand" "r") + (match_operand 2 "const_uimm5_operand")] + UNSPECV_LDDIR) + (clobber (mem:BLK (scratch)))] + "" + "lddir\t%0,%1,%2" + [(set_attr "type" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "loongarch_ldpte_<d>" + [(unspec_volatile:X [(match_operand:X 0 "register_operand" "r") + (match_operand 1 "const_uimm5_operand")] + UNSPECV_LDPTE) + (clobber (mem:BLK (scratch)))] + "" + "ldpte\t%0,%1" + [(set_attr "type" "load") + (set_attr "mode" "<MODE>")]) + + +;; Block moves, see loongarch.c for more details. +;; Argument 0 is the destination. +;; Argument 1 is the source. +;; Argument 2 is the length. +;; Argument 3 is the alignment. + +(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "general_operand") + (match_operand:BLK 1 "general_operand")) + (use (match_operand:SI 2 "")) + (use (match_operand:SI 3 "const_int_operand"))])] + "" +{ + if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P + && loongarch_expand_block_move (operands[0], operands[1], operands[2])) + DONE; + else + FAIL; +}) + +;; +;; .................... +;; +;; SHIFTS +;; +;; .................... + +(define_insn "<optab><mode>3" + [(set (match_operand:GPR 0 "register_operand" "=r") + (any_shift:GPR (match_operand:GPR 1 "register_operand" "r") + (match_operand:SI 2 "arith_operand" "rI")))] + "" +{ + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (INTVAL (operands[2]) + & (GET_MODE_BITSIZE (<MODE>mode) - 1)); + + return "<insn>%i2.<d>\t%0,%1,%2"; +} + [(set_attr "type" "shift") + (set_attr "mode" "<MODE>")]) + +(define_insn "*<optab>si3_extend" + [(set (match_operand:DI 0 "register_operand" "=r") + (sign_extend:DI + (any_shift:SI (match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "arith_operand" "rI"))))] + "TARGET_64BIT" +{ + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (INTVAL (operands[2]) & 0x1f); + + return "<insn>%i2.w\t%0,%1,%2"; +} + [(set_attr "type" "shift") + (set_attr "mode" "SI")]) + +(define_insn "rotr<mode>3" + [(set (match_operand:GPR 0 "register_operand" "=r,r") + (rotatert:GPR (match_operand:GPR 1 "register_operand" "r,r") + (match_operand:SI 2 "arith_operand" "r,I")))] + "" + "rotr%i2.<d>\t%0,%1,%2" + [(set_attr "type" "shift,shift") + (set_attr "mode" "<MODE>")]) + + +;; The following templates were added to generate "bstrpick.d + alsl.d" +;; instruction pairs. 
+;; It is required that the values of const_immalsl_operand and +;; immediate_operand must have the following correspondence: +;; +;; (immediate_operand >> const_immalsl_operand) == 0xffffffff + +(define_insn "zero_extend_ashift" + [(set (match_operand:DI 0 "register_operand" "=r") + (and:DI (ashift:DI (match_operand:DI 1 "register_operand" "r") + (match_operand 2 "const_immalsl_operand" "")) + (match_operand 3 "immediate_operand" "")))] + "TARGET_64BIT + && ((INTVAL (operands[3]) >> INTVAL (operands[2])) == 0xffffffff)" + "bstrpick.d\t%0,%1,31,0\n\talsl.d\t%0,%0,$r0,%2" + [(set_attr "type" "arith") + (set_attr "mode" "DI") + (set_attr "insn_count" "2")]) + +(define_insn "bstrpick_alsl_paired" + [(set (match_operand:DI 0 "register_operand" "=&r") + (plus:DI (match_operand:DI 1 "register_operand" "r") + (and:DI (ashift:DI (match_operand:DI 2 "register_operand" "r") + (match_operand 3 "const_immalsl_operand" "")) + (match_operand 4 "immediate_operand" ""))))] + "TARGET_64BIT + && ((INTVAL (operands[4]) >> INTVAL (operands[3])) == 0xffffffff)" + "bstrpick.d\t%0,%2,31,0\n\talsl.d\t%0,%0,%1,%3" + [(set_attr "type" "arith") + (set_attr "mode" "DI") + (set_attr "insn_count" "2")]) + +(define_insn "alsl<mode>3" + [(set (match_operand:GPR 0 "register_operand" "=r") + (plus:GPR (ashift:GPR (match_operand:GPR 1 "register_operand" "r") + (match_operand 2 "const_immalsl_operand" "")) + (match_operand:GPR 3 "register_operand" "r")))] + "" + "alsl.<d>\t%0,%1,%3,%2" + [(set_attr "type" "arith") + (set_attr "mode" "<MODE>")]) + + + +;; Reverse the order of bytes of operand 1 and store the result in operand 0. + +(define_insn "bswaphi2" + [(set (match_operand:HI 0 "register_operand" "=r") + (bswap:HI (match_operand:HI 1 "register_operand" "r")))] + "" + "revb.2h\t%0,%1" + [(set_attr "type" "shift")]) + +(define_insn_and_split "bswapsi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (bswap:SI (match_operand:SI 1 "register_operand" "r")))] + "" + "#" + "" + [(set (match_dup 0) (unspec:SI [(match_dup 1)] UNSPEC_REVB_2H)) + (set (match_dup 0) (rotatert:SI (match_dup 0) (const_int 16)))] + "" + [(set_attr "insn_count" "2")]) + +(define_insn_and_split "bswapdi2" + [(set (match_operand:DI 0 "register_operand" "=r") + (bswap:DI (match_operand:DI 1 "register_operand" "r")))] + "TARGET_64BIT" + "#" + "" + [(set (match_dup 0) (unspec:DI [(match_dup 1)] UNSPEC_REVB_4H)) + (set (match_dup 0) (unspec:DI [(match_dup 0)] UNSPEC_REVH_D))] + "" + [(set_attr "insn_count" "2")]) + +(define_insn "revb_2h" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:SI 1 "register_operand" "r")] UNSPEC_REVB_2H))] + "" + "revb.2h\t%0,%1" + [(set_attr "type" "shift")]) + +(define_insn "revb_4h" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI [(match_operand:DI 1 "register_operand" "r")] UNSPEC_REVB_4H))] + "TARGET_64BIT" + "revb.4h\t%0,%1" + [(set_attr "type" "shift")]) + +(define_insn "revh_d" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI [(match_operand:DI 1 "register_operand" "r")] UNSPEC_REVH_D))] + "TARGET_64BIT" + "revh.d\t%0,%1" + [(set_attr "type" "shift")]) + +;; +;; .................... +;; +;; CONDITIONAL BRANCHES +;; +;; .................... 
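+
+;; As an overview example (lp64d ABI), a source-level test such as
+;;
+;;   if (a < b)   /* a, b of type long */
+;;     ...
+;;
+;; goes through cbranch<mode>4 below and is normally emitted as a single
+;; compare-and-branch instruction, roughly
+;;
+;;   blt  $r4,$r5,.L2
+;;
+;; (or the inverted bge form, depending on how the blocks are laid out).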
+ +;; Conditional branches + +(define_insn "*branch_fp_FCCmode" + [(set (pc) + (if_then_else + (match_operator 1 "equality_operator" + [(match_operand:FCC 2 "register_operand" "z") + (const_int 0)]) + (label_ref (match_operand 0 "" "")) + (pc)))] + "TARGET_HARD_FLOAT" +{ + return loongarch_output_conditional_branch (insn, operands, + LARCH_BRANCH ("b%F1", "%Z2%0"), + LARCH_BRANCH ("b%W1", "%Z2%0")); +} + [(set_attr "type" "branch")]) + +(define_insn "*branch_fp_inverted_FCCmode" + [(set (pc) + (if_then_else + (match_operator 1 "equality_operator" + [(match_operand:FCC 2 "register_operand" "z") + (const_int 0)]) + (pc) + (label_ref (match_operand 0 "" ""))))] + "TARGET_HARD_FLOAT" +{ + return loongarch_output_conditional_branch (insn, operands, + LARCH_BRANCH ("b%W1", "%Z2%0"), + LARCH_BRANCH ("b%F1", "%Z2%0")); +} + [(set_attr "type" "branch")]) + +;; Conditional branches on ordered comparisons with zero. + +(define_insn "*branch_order<mode>" + [(set (pc) + (if_then_else + (match_operator 1 "order_operator" + [(match_operand:X 2 "register_operand" "r,r") + (match_operand:X 3 "reg_or_0_operand" "J,r")]) + (label_ref (match_operand 0 "" "")) + (pc)))] + "" + { return loongarch_output_order_conditional_branch (insn, operands, false); } + [(set_attr "type" "branch")]) + +(define_insn "*branch_order<mode>_inverted" + [(set (pc) + (if_then_else + (match_operator 1 "order_operator" + [(match_operand:X 2 "register_operand" "r,r") + (match_operand:X 3 "reg_or_0_operand" "J,r")]) + (pc) + (label_ref (match_operand 0 "" ""))))] + "" + { return loongarch_output_order_conditional_branch (insn, operands, true); } + [(set_attr "type" "branch")]) + +;; Conditional branch on equality comparison. + +(define_insn "branch_equality<mode>" + [(set (pc) + (if_then_else + (match_operator 1 "equality_operator" + [(match_operand:X 2 "register_operand" "r") + (match_operand:X 3 "reg_or_0_operand" "rJ")]) + (label_ref (match_operand 0 "" "")) + (pc)))] + "" + { return loongarch_output_equal_conditional_branch (insn, operands, false); } + [(set_attr "type" "branch")]) + + +(define_insn "*branch_equality<mode>_inverted" + [(set (pc) + (if_then_else + (match_operator 1 "equality_operator" + [(match_operand:X 2 "register_operand" "r") + (match_operand:X 3 "reg_or_0_operand" "rJ")]) + (pc) + (label_ref (match_operand 0 "" ""))))] + "" + { return loongarch_output_equal_conditional_branch (insn, operands, true); } + [(set_attr "type" "branch")]) + + +(define_expand "cbranch<mode>4" + [(set (pc) + (if_then_else (match_operator 0 "comparison_operator" + [(match_operand:GPR 1 "register_operand") + (match_operand:GPR 2 "nonmemory_operand")]) + (label_ref (match_operand 3 "")) + (pc)))] + "" +{ + loongarch_expand_conditional_branch (operands); + DONE; +}) + +(define_expand "cbranch<mode>4" + [(set (pc) + (if_then_else (match_operator 0 "comparison_operator" + [(match_operand:ANYF 1 "register_operand") + (match_operand:ANYF 2 "register_operand")]) + (label_ref (match_operand 3 "")) + (pc)))] + "" +{ + loongarch_expand_conditional_branch (operands); + DONE; +}) + +;; Used to implement built-in functions. +(define_expand "condjump" + [(set (pc) + (if_then_else (match_operand 0) + (label_ref (match_operand 1)) + (pc)))]) + + + +;; +;; .................... +;; +;; SETTING A REGISTER FROM A COMPARISON +;; +;; .................... + +;; Destination is always set in SI mode. 
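+;;
+;; For example (lp64d ABI), the comparison in
+;;
+;;   int lt (long a, long b) { return a < b; }
+;;
+;; is expanded by cstore<mode>4 below and is expected to become a single
+;;
+;;   slt  $r4,$r4,$r5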
+ +(define_expand "cstore<mode>4" + [(set (match_operand:SI 0 "register_operand") + (match_operator:SI 1 "loongarch_cstore_operator" + [(match_operand:GPR 2 "register_operand") + (match_operand:GPR 3 "nonmemory_operand")]))] + "" +{ + loongarch_expand_scc (operands); + DONE; +}) + +(define_insn "*seq_zero_<X:mode><GPR:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (eq:GPR (match_operand:X 1 "register_operand" "r") + (const_int 0)))] + "" + "sltui\t%0,%1,1" + [(set_attr "type" "slt") + (set_attr "mode" "<X:MODE>")]) + + +(define_insn "*sne_zero_<X:mode><GPR:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (ne:GPR (match_operand:X 1 "register_operand" "r") + (const_int 0)))] + "" + "sltu\t%0,%.,%1" + [(set_attr "type" "slt") + (set_attr "mode" "<X:MODE>")]) + +(define_insn "*sgt<u>_<X:mode><GPR:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (any_gt:GPR (match_operand:X 1 "register_operand" "r") + (match_operand:X 2 "reg_or_0_operand" "rJ")))] + "" + "slt<u>\t%0,%z2,%1" + [(set_attr "type" "slt") + (set_attr "mode" "<X:MODE>")]) + +(define_insn "*sge<u>_<X:mode><GPR:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (any_ge:GPR (match_operand:X 1 "register_operand" "r") + (const_int 1)))] + "" + "slt<u>i\t%0,%.,%1" + [(set_attr "type" "slt") + (set_attr "mode" "<X:MODE>")]) + +(define_insn "*slt<u>_<X:mode><GPR:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (any_lt:GPR (match_operand:X 1 "register_operand" "r") + (match_operand:X 2 "arith_operand" "rI")))] + "" + "slt<u>%i2\t%0,%1,%2"; + [(set_attr "type" "slt") + (set_attr "mode" "<X:MODE>")]) + +(define_insn "*sle<u>_<X:mode><GPR:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (any_le:GPR (match_operand:X 1 "register_operand" "r") + (match_operand:X 2 "sle_operand" "")))] + "" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) + 1); + return "slt<u>i\t%0,%1,%2"; +} + [(set_attr "type" "slt") + (set_attr "mode" "<X:MODE>")]) + + +;; +;; .................... +;; +;; FLOATING POINT COMPARISONS +;; +;; .................... + +(define_insn "s<code>_<ANYF:mode>_using_FCCmode" + [(set (match_operand:FCC 0 "register_operand" "=z") + (fcond:FCC (match_operand:ANYF 1 "register_operand" "f") + (match_operand:ANYF 2 "register_operand" "f")))] + "" + "fcmp.<fcond>.<fmt>\t%Z0%1,%2" + [(set_attr "type" "fcmp") + (set_attr "mode" "FCC")]) + + +;; +;; .................... +;; +;; UNCONDITIONAL BRANCHES +;; +;; .................... + +;; Unconditional branches. 
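+;;
+;; For example, a plain goto in non-PIC code matches *jump_absolute below
+;; and is emitted as
+;;
+;;   b  .L3
+;;
+;; while a computed goto is forced into a register and emitted through
+;; indirect_jump as
+;;
+;;   jr  $r12
+;;
+;; (with whatever register the allocator picked).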
+ +(define_expand "jump" + [(set (pc) + (label_ref (match_operand 0)))]) + +(define_insn "*jump_absolute" + [(set (pc) + (label_ref (match_operand 0)))] + "!flag_pic" +{ + return "b\t%l0"; +} + [(set_attr "type" "branch")]) + +(define_insn "*jump_pic" + [(set (pc) + (label_ref (match_operand 0)))] + "flag_pic" +{ + return "b\t%0"; +} + [(set_attr "type" "branch")]) + +(define_expand "indirect_jump" + [(set (pc) (match_operand 0 "register_operand"))] + "" +{ + operands[0] = force_reg (Pmode, operands[0]); + emit_jump_insn (gen_indirect_jump (Pmode, operands[0])); + DONE; +}) + +(define_insn "@indirect_jump<mode>" + [(set (pc) (match_operand:P 0 "register_operand" "r"))] + "" + "jr\t%0" + [(set_attr "type" "jump") + (set_attr "mode" "none")]) + +(define_expand "tablejump" + [(set (pc) + (match_operand 0 "register_operand")) + (use (label_ref (match_operand 1 "")))] + "" +{ + if (flag_pic) + operands[0] = expand_simple_binop (Pmode, PLUS, operands[0], + gen_rtx_LABEL_REF (Pmode, + operands[1]), + NULL_RTX, 0, OPTAB_DIRECT); + emit_jump_insn (gen_tablejump (Pmode, operands[0], operands[1])); + DONE; +}) + +(define_insn "@tablejump<mode>" + [(set (pc) + (match_operand:P 0 "register_operand" "r")) + (use (label_ref (match_operand 1 "" "")))] + "" + "jr\t%0" + [(set_attr "type" "jump") + (set_attr "mode" "none")]) + + + +;; +;; .................... +;; +;; Function prologue/epilogue +;; +;; .................... +;; + +(define_expand "prologue" + [(const_int 1)] + "" +{ + loongarch_expand_prologue (); + DONE; +}) + +;; Block any insns from being moved before this point, since the +;; profiling call to mcount can use various registers that aren't +;; saved or used to pass arguments. + +(define_insn "blockage" + [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)] + "" + "" + [(set_attr "type" "ghost") + (set_attr "mode" "none")]) + +(define_insn "@probe_stack_range<P:mode>" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec_volatile:P [(match_operand:P 1 "register_operand" "0") + (match_operand:P 2 "register_operand" "r") + (match_operand:P 3 "register_operand" "r")] + UNSPECV_PROBE_STACK_RANGE))] + "" +{ + return loongarch_output_probe_stack_range (operands[0], + operands[2], + operands[3]); +} + [(set_attr "type" "unknown") + (set_attr "mode" "<MODE>")]) + +(define_expand "epilogue" + [(const_int 2)] + "" +{ + loongarch_expand_epilogue (false); + DONE; +}) + +(define_expand "sibcall_epilogue" + [(const_int 2)] + "" +{ + loongarch_expand_epilogue (true); + DONE; +}) + +;; Trivial return. Make it look like a normal return insn as that +;; allows jump optimizations to work better. + +(define_expand "return" + [(simple_return)] + "loongarch_can_use_return_insn ()" + { }) + +(define_expand "simple_return" + [(simple_return)] + "" + { }) + +(define_insn "*<optab>" + [(any_return)] + "" +{ + operands[0] = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM); + return "jr\t%0"; +} + [(set_attr "type" "jump") + (set_attr "mode" "none")]) + +;; Normal return. + +(define_insn "<optab>_internal" + [(any_return) + (use (match_operand 0 "pmode_register_operand" ""))] + "" + "jr\t%0" + [(set_attr "type" "jump") + (set_attr "mode" "none")]) + +;; Exception return. +(define_insn "loongarch_ertn" + [(return) + (unspec_volatile [(const_int 0)] UNSPECV_ERTN)] + "" + "ertn" + [(set_attr "type" "trap") + (set_attr "mode" "none")]) + +;; This is used in compiling the unwind routines. 
+(define_expand "eh_return" + [(use (match_operand 0 "general_operand"))] + "" +{ + if (GET_MODE (operands[0]) != word_mode) + operands[0] = convert_to_mode (word_mode, operands[0], 0); + if (TARGET_64BIT) + emit_insn (gen_eh_set_ra_di (operands[0])); + else + emit_insn (gen_eh_set_ra_si (operands[0])); + DONE; +}) + +;; Clobber the return address on the stack. We can't expand this +;; until we know where it will be put in the stack frame. + +(define_insn "eh_set_ra_si" + [(unspec [(match_operand:SI 0 "register_operand" "r")] UNSPEC_EH_RETURN) + (clobber (match_scratch:SI 1 "=&r"))] + "! TARGET_64BIT" + "#") + +(define_insn "eh_set_ra_di" + [(unspec [(match_operand:DI 0 "register_operand" "r")] UNSPEC_EH_RETURN) + (clobber (match_scratch:DI 1 "=&r"))] + "TARGET_64BIT" + "#") + +(define_split + [(unspec [(match_operand 0 "register_operand")] UNSPEC_EH_RETURN) + (clobber (match_scratch 1))] + "reload_completed" + [(const_int 0)] +{ + loongarch_set_return_address (operands[0], operands[1]); + DONE; +}) + + + +;; +;; .................... +;; +;; FUNCTION CALLS +;; +;; .................... + +;; Sibling calls. All these patterns use jump instructions. + +(define_expand "sibcall" + [(parallel [(call (match_operand 0 "") + (match_operand 1 "")) + (use (match_operand 2 "")) ;; next_arg_reg + (use (match_operand 3 ""))])] ;; struct_value_size_rtx + "" +{ + rtx target = loongarch_legitimize_call_address (XEXP (operands[0], 0)); + + emit_call_insn (gen_sibcall_internal (target, operands[1])); + DONE; +}) + +(define_insn "sibcall_internal" + [(call (mem:SI (match_operand 0 "call_insn_operand" "j,c,a,t,h")) + (match_operand 1 "" ""))] + "SIBLING_CALL_P (insn)" +{ + switch (which_alternative) + { + case 0: + return "jr\t%0"; + case 1: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r12,(%%pcrel(%0+0x20000))>>18\n\t" + "jirl\t$r0,$r12,%%pcrel(%0+4)-(%%pcrel(%0+4+0x20000)>>18<<18)"; + else if (TARGET_CMODEL_EXTREME) + return "la.local\t$r12,$r13,%0\n\tjr\t$r12"; + else + return "b\t%0"; + case 2: + if (TARGET_CMODEL_TINY_STATIC) + return "b\t%0"; + else if (TARGET_CMODEL_EXTREME) + return "la.global\t$r12,$r13,%0\n\tjr\t$r12"; + else + return "la.global\t$r12,%0\n\tjr\t$r12"; + case 3: + if (TARGET_CMODEL_EXTREME) + return "la.global\t$r12,$r13,%0\n\tjr\t$r12"; + else + return "la.global\t$r12,%0\n\tjr\t$r12"; + case 4: + if (TARGET_CMODEL_NORMAL || TARGET_CMODEL_TINY) + return "b\t%%plt(%0)"; + else if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r12,(%%plt(%0)+0x20000)>>18\n\t" + "jirl\t$r0,$r12,%%plt(%0)+4-((%%plt(%0)+(4+0x20000))>>18<<18)"; + else + /* Cmodel extreme and tiny static not support plt. */ + gcc_unreachable (); + default: + gcc_unreachable (); + } +} + [(set_attr "jirl" "indirect,direct,direct,direct,direct")]) + +(define_expand "sibcall_value" + [(parallel [(set (match_operand 0 "") + (call (match_operand 1 "") + (match_operand 2 ""))) + (use (match_operand 3 ""))])] ;; next_arg_reg + "" +{ + rtx target = loongarch_legitimize_call_address (XEXP (operands[1], 0)); + + /* Handle return values created by loongarch_pass_fpr_pair. */ + if (GET_CODE (operands[0]) == PARALLEL && XVECLEN (operands[0], 0) == 2) + { + rtx arg1 = XEXP (XVECEXP (operands[0],0, 0), 0); + rtx arg2 = XEXP (XVECEXP (operands[0],0, 1), 0); + + emit_call_insn (gen_sibcall_value_multiple_internal (arg1, target, + operands[2], + arg2)); + } + else + { + /* Handle return values created by loongarch_return_fpr_single. 
*/ + if (GET_CODE (operands[0]) == PARALLEL && XVECLEN (operands[0], 0) == 1) + operands[0] = XEXP (XVECEXP (operands[0], 0, 0), 0); + + emit_call_insn (gen_sibcall_value_internal (operands[0], target, + operands[2])); + } + DONE; +}) + +(define_insn "sibcall_value_internal" + [(set (match_operand 0 "register_operand" "") + (call (mem:SI (match_operand 1 "call_insn_operand" "j,c,a,t,h")) + (match_operand 2 "" "")))] + "SIBLING_CALL_P (insn)" +{ + switch (which_alternative) + { + case 0: + return "jr\t%1"; + case 1: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r12,%%pcrel(%1+0x20000)>>18\n\t" + "jirl\t$r0,$r12,%%pcrel(%1+4)-((%%pcrel(%1+4+0x20000))>>18<<18)"; + else if (TARGET_CMODEL_EXTREME) + return "la.local\t$r12,$r13,%1\n\tjr\t$r12"; + else + return "b\t%1"; + case 2: + if (TARGET_CMODEL_TINY_STATIC) + return "b\t%1"; + else if (TARGET_CMODEL_EXTREME) + return "la.global\t$r12,$r13,%1\n\tjr\t$r12"; + else + return "la.global\t$r12,%1\n\tjr\t$r12"; + case 3: + if (TARGET_CMODEL_EXTREME) + return "la.global\t$r12,$r13,%1\n\tjr\t$r12"; + else + return "la.global\t$r12,%1\n\tjr\t$r12"; + case 4: + if (TARGET_CMODEL_NORMAL || TARGET_CMODEL_TINY) + return " b\t%%plt(%1)"; + else if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r12,(%%plt(%1)+0x20000)>>18\n\t" + "jirl\t$r0,$r12,%%plt(%1)+4-((%%plt(%1)+(4+0x20000))>>18<<18)"; + else + /* Cmodel extreme and tiny static not support plt. */ + gcc_unreachable (); + default: + gcc_unreachable (); + } +} + [(set_attr "jirl" "indirect,direct,direct,direct,direct")]) + +(define_insn "sibcall_value_multiple_internal" + [(set (match_operand 0 "register_operand" "") + (call (mem:SI (match_operand 1 "call_insn_operand" "j,c,a,t,h")) + (match_operand 2 "" ""))) + (set (match_operand 3 "register_operand" "") + (call (mem:SI (match_dup 1)) + (match_dup 2)))] + "SIBLING_CALL_P (insn)" +{ + switch (which_alternative) + { + case 0: + return "jr\t%1"; + case 1: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r12,%%pcrel(%1+0x20000)>>18\n\t" + "jirl\t$r0,$r12,%%pcrel(%1+4)-(%%pcrel(%1+4+0x20000)>>18<<18)"; + else if (TARGET_CMODEL_EXTREME) + return "la.local\t$r12,$r13,%1\n\tjr\t$r12"; + else + return "b\t%1"; + case 2: + if (TARGET_CMODEL_TINY_STATIC) + return "b\t%1"; + else if (TARGET_CMODEL_EXTREME) + return "la.global\t$r12,$r13,%1\n\tjr\t$r12"; + else + return "la.global\t$r12,%1\n\tjr\t$r12"; + case 3: + if (TARGET_CMODEL_EXTREME) + return "la.global\t$r12,$r13,%1\n\tjr\t$r12"; + else + return "la.global\t$r12,%1\n\tjr\t$r12"; + case 4: + if (TARGET_CMODEL_NORMAL || TARGET_CMODEL_TINY) + return "b\t%%plt(%1)"; + else if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r12,(%%plt(%1)+0x20000)>>18\n\t" + "jirl\t$r0,$r12,%%plt(%1)+4-((%%plt(%1)+(4+0x20000))>>18<<18)"; + else + /* Cmodel extreme and tiny static not support plt. 
*/ + gcc_unreachable (); + default: + gcc_unreachable (); + } +} + [(set_attr "jirl" "indirect,direct,direct,direct,direct")]) + +(define_expand "call" + [(parallel [(call (match_operand 0 "") + (match_operand 1 "")) + (use (match_operand 2 "")) ;; next_arg_reg + (use (match_operand 3 ""))])] ;; struct_value_size_rtx + "" +{ + rtx target = loongarch_legitimize_call_address (XEXP (operands[0], 0)); + + emit_call_insn (gen_call_internal (target, operands[1])); + DONE; +}) + +(define_insn "call_internal" + [(call (mem:SI (match_operand 0 "call_insn_operand" "e,c,a,t,h")) + (match_operand 1 "" "")) + (clobber (reg:SI RETURN_ADDR_REGNUM))] + "" +{ + switch (which_alternative) + { + case 0: + return "jirl\t$r1,%0,0"; + case 1: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r1,%%pcrel(%0+0x20000)>>18\n\t" + "jirl\t$r1,$r1,%%pcrel(%0+4)-(%%pcrel(%0+4+0x20000)>>18<<18)"; + else if (TARGET_CMODEL_EXTREME) + return "la.local\t$r1,$r12,%0\n\tjirl\t$r1,$r1,0"; + else + return "bl\t%0"; + case 2: + if (TARGET_CMODEL_TINY_STATIC) + return "bl\t%0"; + else if (TARGET_CMODEL_EXTREME) + return "la.global\t$r1,$r12,%0\n\tjirl\t$r1,$r1,0"; + else + return "la.global\t$r1,%0\n\tjirl\t$r1,$r1,0"; + case 3: + if (TARGET_CMODEL_EXTREME) + return "la.global\t$r1,$r12,%0\n\tjirl\t$r1,$r1,0"; + else + return "la.global\t$r1,%0\n\tjirl\t$r1,$r1,0"; + case 4: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r1,(%%plt(%0)+0x20000)>>18\n\t" + "jirl\t$r1,$r1,%%plt(%0)+4-((%%plt(%0)+(4+0x20000))>>18<<18)"; + else if (TARGET_CMODEL_NORMAL || TARGET_CMODEL_TINY) + return "bl\t%%plt(%0)"; + else + /* Cmodel extreme and tiny static not support plt. */ + gcc_unreachable (); + default: + gcc_unreachable (); + } +} + [(set_attr "jirl" "indirect,direct,direct,direct,direct") + (set_attr "insn_count" "1,2,3,3,2")]) + +(define_expand "call_value" + [(parallel [(set (match_operand 0 "") + (call (match_operand 1 "") + (match_operand 2 ""))) + (use (match_operand 3 ""))])] ;; next_arg_reg + "" +{ + rtx target = loongarch_legitimize_call_address (XEXP (operands[1], 0)); + /* Handle return values created by loongarch_pass_fpr_pair. */ + if (GET_CODE (operands[0]) == PARALLEL && XVECLEN (operands[0], 0) == 2) + { + rtx arg1 = XEXP (XVECEXP (operands[0], 0, 0), 0); + rtx arg2 = XEXP (XVECEXP (operands[0], 0, 1), 0); + + emit_call_insn (gen_call_value_multiple_internal (arg1, target, + operands[2], arg2)); + } + else + { + /* Handle return values created by loongarch_return_fpr_single. 
*/ + if (GET_CODE (operands[0]) == PARALLEL && XVECLEN (operands[0], 0) == 1) + operands[0] = XEXP (XVECEXP (operands[0], 0, 0), 0); + + emit_call_insn (gen_call_value_internal (operands[0], target, + operands[2])); + } + DONE; +}) + +(define_insn "call_value_internal" + [(set (match_operand 0 "register_operand" "") + (call (mem:SI (match_operand 1 "call_insn_operand" "e,c,a,t,h")) + (match_operand 2 "" ""))) + (clobber (reg:SI RETURN_ADDR_REGNUM))] + "" +{ + switch (which_alternative) + { + case 0: + return "jirl\t$r1,%1,0"; + case 1: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r1,%%pcrel(%1+0x20000)>>18\n\t" + "jirl\t$r1,$r1,%%pcrel(%1+4)-(%%pcrel(%1+4+0x20000)>>18<<18)"; + else if (TARGET_CMODEL_EXTREME) + return "la.local\t$r1,$r12,%1\n\tjirl\t$r1,$r1,0"; + else + return "bl\t%1"; + case 2: + if (TARGET_CMODEL_TINY_STATIC) + return "bl\t%1"; + else if (TARGET_CMODEL_EXTREME) + return "la.global\t$r1,$r12,%1\n\tjirl\t$r1,$r1,0"; + else + return "la.global\t$r1,%1\n\tjirl\t$r1,$r1,0"; + case 3: + if (TARGET_CMODEL_EXTREME) + return "la.global\t$r1,$r12,%1\n\tjirl\t$r1,$r1,0"; + else + return "la.global\t$r1,%1\n\tjirl\t$r1,$r1,0"; + case 4: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r1,(%%plt(%1)+0x20000)>>18\n\t" + "jirl\t$r1,$r1,%%plt(%1)+4-((%%plt(%1)+(4+0x20000))>>18<<18)"; + else if (TARGET_CMODEL_NORMAL || TARGET_CMODEL_TINY) + return "bl\t%%plt(%1)"; + else + /* Cmodel extreme and tiny static not support plt. */ + gcc_unreachable (); + default: + gcc_unreachable (); + } +} + [(set_attr "jirl" "indirect,direct,direct,direct,direct") + (set_attr "insn_count" "1,2,3,3,2")]) + +(define_insn "call_value_multiple_internal" + [(set (match_operand 0 "register_operand" "") + (call (mem:SI (match_operand 1 "call_insn_operand" "e,c,a,t,h")) + (match_operand 2 "" ""))) + (set (match_operand 3 "register_operand" "") + (call (mem:SI (match_dup 1)) + (match_dup 2))) + (clobber (reg:SI RETURN_ADDR_REGNUM))] + "" +{ + switch (which_alternative) + { + case 0: + return "jirl\t$r1,%1,0"; + case 1: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r1,%%pcrel(%1+0x20000)>>18\n\t" + "jirl\t$r1,$r1,%%pcrel(%1+4)-(%%pcrel(%1+4+0x20000)>>18<<18)"; + else if (TARGET_CMODEL_EXTREME) + return "la.local\t$r1,$r12,%1\n\tjirl\t$r1,$r1,0"; + else + return "bl\t%1"; + case 2: + if (TARGET_CMODEL_TINY_STATIC) + return "bl\t%1"; + else if (TARGET_CMODEL_EXTREME) + return "la.global\t$r1,$r12,%1\n\tjirl\t$r1,$r1,0 "; + else + return "la.global\t$r1,%1\n\tjirl\t$r1,$r1,0"; + case 3: + if (TARGET_CMODEL_EXTREME) + return "la.global\t$r1,$r12,%1\n\tjirl\t$r1,$r1,0"; + else + return "la.global\t$r1,%1\n\tjirl\t$r1,$r1,0"; + case 4: + if (TARGET_CMODEL_LARGE) + return "pcaddu18i\t$r1,(%%plt(%1)+0x20000)>>18\n\t" + "jirl\t$r1,$r1,%%plt(%1)+4-((%%plt(%1)+(4+0x20000))>>18<<18)"; + else if (TARGET_CMODEL_NORMAL || TARGET_CMODEL_TINY) + return "bl\t%%plt(%1)"; + else + /* Cmodel extreme and tiny static not support plt. */ + gcc_unreachable (); + default: + gcc_unreachable (); + } +} + [(set_attr "jirl" "indirect,direct,direct,direct,direct") + (set_attr "insn_count" "1,2,3,3,2")]) + + +;; Call subroutine returning any type. 
+(define_expand "untyped_call" + [(parallel [(call (match_operand 0 "") + (const_int 0)) + (match_operand 1 "") + (match_operand 2 "")])] + "" +{ + int i; + + emit_call_insn (gen_call (operands[0], const0_rtx, NULL, const0_rtx)); + + for (i = 0; i < XVECLEN (operands[2], 0); i++) + { + rtx set = XVECEXP (operands[2], 0, i); + loongarch_emit_move (SET_DEST (set), SET_SRC (set)); + } + + emit_insn (gen_blockage ()); + DONE; +}) + +;; +;; .................... +;; +;; MISC. +;; +;; .................... +;; + +(define_insn "nop" + [(const_int 0)] + "" + "nop" + [(set_attr "type" "nop") + (set_attr "mode" "none")]) + +;; __builtin_loongarch_movfcsr2gr: move the FCSR into operand 0. +(define_insn "loongarch_movfcsr2gr" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec_volatile:SI [(match_operand 1 "const_uimm5_operand")] + UNSPECV_MOVFCSR2GR))] + "TARGET_HARD_FLOAT" + "movfcsr2gr\t%0,$r%1") + +;; __builtin_loongarch_movgr2fcsr: move operand 0 into the FCSR. +(define_insn "loongarch_movgr2fcsr" + [(unspec_volatile [(match_operand 0 "const_uimm5_operand") + (match_operand:SI 1 "register_operand" "r")] + UNSPECV_MOVGR2FCSR)] + "TARGET_HARD_FLOAT" + "movgr2fcsr\t$r%0,%1") + +(define_insn "fclass_<fmt>" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")] + UNSPEC_FCLASS))] + "TARGET_HARD_FLOAT" + "fclass.<fmt>\t%0,%1" + [(set_attr "type" "unknown") + (set_attr "mode" "<MODE>")]) + +(define_insn "bytepick_w" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "register_operand" "r") + (match_operand:SI 3 "const_0_to_3_operand" "n")] + UNSPEC_BYTEPICK_W))] + "" + "bytepick.w\t%0,%1,%2,%z3" + [(set_attr "mode" "SI")]) + +(define_insn "bytepick_d" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI [(match_operand:DI 1 "register_operand" "r") + (match_operand:DI 2 "register_operand" "r") + (match_operand:DI 3 "const_0_to_7_operand" "n")] + UNSPEC_BYTEPICK_D))] + "" + "bytepick.d\t%0,%1,%2,%z3" + [(set_attr "mode" "DI")]) + +(define_insn "bitrev_4b" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:SI 1 "register_operand" "r")] + UNSPEC_BITREV_4B))] + "" + "bitrev.4b\t%0,%1" + [(set_attr "type" "unknown") + (set_attr "mode" "SI")]) + +(define_insn "bitrev_8b" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI [(match_operand:DI 1 "register_operand" "r")] + UNSPEC_BITREV_8B))] + "" + "bitrev.8b\t%0,%1" + [(set_attr "type" "unknown") + (set_attr "mode" "DI")]) + +(define_insn "@stack_tie<mode>" + [(set (mem:BLK (scratch)) + (unspec:BLK [(match_operand:X 0 "register_operand" "r") + (match_operand:X 1 "register_operand" "r")] + UNSPEC_TIE))] + "" + "" + [(set_attr "length" "0") + (set_attr "type" "ghost")]) + + +(define_split + [(match_operand 0 "small_data_pattern")] + "reload_completed" + [(match_dup 0)] + { operands[0] = loongarch_rewrite_small_data (operands[0]); }) + + +;; Match paired HI/SI/SF/DFmode load/stores. +(define_insn "*join2_load_store<JOIN_MODE:mode>" + [(set (match_operand:JOIN_MODE 0 "nonimmediate_operand" + "=&r,f,m,m,&r,ZC") + (match_operand:JOIN_MODE 1 "nonimmediate_operand" "m,m,r,f,ZC,r")) + (set (match_operand:JOIN_MODE 2 "nonimmediate_operand" + "=r,f,m,m,r,ZC") + (match_operand:JOIN_MODE 3 "nonimmediate_operand" "m,m,r,f,ZC,r"))] + "reload_completed" + { + /* The load destination does not overlap the source. 
*/ + gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])); + output_asm_insn (loongarch_output_move (operands[0], operands[1]), + operands); + output_asm_insn (loongarch_output_move (operands[2], operands[3]), + &operands[2]); + return ""; + } + [(set_attr "move_type" + "load,fpload,store,fpstore,load,store") + (set_attr "insn_count" "2,2,2,2,2,2")]) + +;; 2 HI/SI/SF/DF loads are bonded. +(define_peephole2 + [(set (match_operand:JOIN_MODE 0 "register_operand") + (match_operand:JOIN_MODE 1 "non_volatile_mem_operand")) + (set (match_operand:JOIN_MODE 2 "register_operand") + (match_operand:JOIN_MODE 3 "non_volatile_mem_operand"))] + "loongarch_load_store_bonding_p (operands, <JOIN_MODE:MODE>mode, true)" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (set (match_dup 2) + (match_dup 3))])] + "") + +;; 2 HI/SI/SF/DF stores are bonded. +(define_peephole2 + [(set (match_operand:JOIN_MODE 0 "memory_operand") + (match_operand:JOIN_MODE 1 "register_operand")) + (set (match_operand:JOIN_MODE 2 "memory_operand") + (match_operand:JOIN_MODE 3 "register_operand"))] + "loongarch_load_store_bonding_p (operands, <JOIN_MODE:MODE>mode, false)" + [(parallel [(set (match_dup 0) + (match_dup 1)) + (set (match_dup 2) + (match_dup 3))])] + "") + +;; Match paired HImode loads. +(define_insn "*join2_loadhi" + [(set (match_operand:SI 0 "register_operand" "=&r") + (any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand" "m"))) + (set (match_operand:SI 2 "register_operand" "=r") + (any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand" "m")))] + "reload_completed" + { + /* The load destination does not overlap the source. */ + gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])); + output_asm_insn ("ld.h<u>\t%0,%1", operands); + output_asm_insn ("ld.h<u>\t%2,%3", operands); + + return ""; + } + [(set_attr "move_type" "load") + (set_attr "insn_count" "2")]) + + +;; 2 HI loads are bonded. +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand"))) + (set (match_operand:SI 2 "register_operand") + (any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand")))] + "loongarch_load_store_bonding_p (operands, HImode, true)" + [(parallel [(set (match_dup 0) + (any_extend:SI (match_dup 1))) + (set (match_dup 2) + (any_extend:SI (match_dup 3)))])] + "") + + + +(define_mode_iterator QHSD [QI HI SI DI]) + +(define_insn "loongarch_crc_w_<size>_w" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:QHSD 1 "register_operand" "r") + (match_operand:SI 2 "register_operand" "r")] + UNSPEC_CRC))] + "" + "crc.w.<size>.w\t%0,%1,%2" + [(set_attr "type" "unknown") + (set_attr "mode" "<MODE>")]) + +(define_insn "loongarch_crcc_w_<size>_w" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:QHSD 1 "register_operand" "r") + (match_operand:SI 2 "register_operand" "r")] + UNSPEC_CRCC))] + "" + "crcc.w.<size>.w\t%0,%1,%2" + [(set_attr "type" "unknown") + (set_attr "mode" "<MODE>")]) + +;; Synchronization instructions. + +(include "sync.md") + +(include "generic.md") +(include "la464.md") + +(define_c_enum "unspec" [ + UNSPEC_ADDRESS_FIRST +]) diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt new file mode 100644 index 0000000..3ff0d86 --- /dev/null +++ b/gcc/config/loongarch/loongarch.opt @@ -0,0 +1,186 @@ +; Generated by "genstr" from the template "loongarch.opt.in" +; and definitions from "loongarch-strings". 
+; +; Please do not edit this file directly. +; It will be automatically updated during a gcc build +; if you change "loongarch.opt.in" or "loongarch-strings". +; +; Copyright (C) 2021-2022 Free Software Foundation, Inc. +; +; This file is part of GCC. +; +; GCC is free software; you can redistribute it and/or modify it under +; the terms of the GNU General Public License as published by the Free +; Software Foundation; either version 3, or (at your option) any later +; version. +; +; GCC is distributed in the hope that it will be useful, but WITHOUT +; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +; License for more details. +; +; You should have received a copy of the GNU General Public License +; along with GCC; see the file COPYING3. If not see +; <http://www.gnu.org/licenses/>. +; + +; Variables (macros) that should be exported by loongarch.opt: +; la_opt_switches, +; la_opt_abi_base, la_opt_abi_ext, +; la_opt_cpu_arch, la_opt_cpu_tune, +; la_opt_fpu, +; la_cmodel. + +HeaderInclude +config/loongarch/loongarch-opts.h + +HeaderInclude +config/loongarch/loongarch-str.h + +Variable +HOST_WIDE_INT la_opt_switches = 0 + +; ISA related options +;; Base ISA +Enum +Name(isa_base) Type(int) +Basic ISAs of LoongArch: + +EnumValue +Enum(isa_base) String(la64) Value(ISA_BASE_LA64V100) + + +;; ISA extensions / adjustments +Enum +Name(isa_ext_fpu) Type(int) +FPU types of LoongArch: + +EnumValue +Enum(isa_ext_fpu) String(none) Value(ISA_EXT_NOFPU) + +EnumValue +Enum(isa_ext_fpu) String(32) Value(ISA_EXT_FPU32) + +EnumValue +Enum(isa_ext_fpu) String(64) Value(ISA_EXT_FPU64) + +mfpu= +Target RejectNegative Joined ToLower Enum(isa_ext_fpu) Var(la_opt_fpu) Init(M_OPTION_NOT_SEEN) +-mfpu=FPU Generate code for the given FPU. + +mfpu=0 +Target RejectNegative Alias(mfpu=,none) + +msoft-float +Target Driver RejectNegative Var(la_opt_switches) Mask(FORCE_SOFTF) Negative(msingle-float) +Prevent the use of all hardware floating-point instructions. + +msingle-float +Target Driver RejectNegative Var(la_opt_switches) Mask(FORCE_F32) Negative(mdouble-float) +Restrict the use of hardware floating-point instructions to 32-bit operations. + +mdouble-float +Target Driver RejectNegative Var(la_opt_switches) Mask(FORCE_F64) Negative(msoft-float) +Allow hardware floating-point instructions to cover both 32-bit and 64-bit operations. + + +;; Base target models (implies ISA & tune parameters) +Enum +Name(cpu_type) Type(int) +LoongArch CPU types: + +EnumValue +Enum(cpu_type) String(native) Value(CPU_NATIVE) + +EnumValue +Enum(cpu_type) String(loongarch64) Value(CPU_LOONGARCH64) + +EnumValue +Enum(cpu_type) String(la464) Value(CPU_LA464) + +march= +Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_arch) Init(M_OPTION_NOT_SEEN) +-march=PROCESSOR Generate code for the given PROCESSOR ISA. + +mtune= +Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_tune) Init(M_OPTION_NOT_SEEN) +-mtune=PROCESSOR Generate optimized code for PROCESSOR. 
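+
+; As an illustrative combination of the options above, a command line such as
+;
+;   gcc -march=loongarch64 -mtune=la464 -mfpu=64 -c foo.c
+;
+; generates code for the loongarch64 ISA, tunes it for the la464 pipeline,
+; and permits the 64-bit hardware floating-point instructions.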
+ + +; ABI related options +; (ISA constraints on ABI are handled dynamically) + +;; Base ABI +Enum +Name(abi_base) Type(int) +Base ABI types for LoongArch: + +EnumValue +Enum(abi_base) String(lp64d) Value(ABI_BASE_LP64D) + +EnumValue +Enum(abi_base) String(lp64f) Value(ABI_BASE_LP64F) + +EnumValue +Enum(abi_base) String(lp64s) Value(ABI_BASE_LP64S) + +mabi= +Target RejectNegative Joined ToLower Enum(abi_base) Var(la_opt_abi_base) Init(M_OPTION_NOT_SEEN) +-mabi=BASEABI Generate code that conforms to the given BASEABI. + +;; ABI Extension +Variable +int la_opt_abi_ext = M_OPTION_NOT_SEEN + + +mbranch-cost= +Target RejectNegative Joined UInteger Var(loongarch_branch_cost) +-mbranch-cost=COST Set the cost of branches to roughly COST instructions. + +mcheck-zero-division +Target Mask(CHECK_ZERO_DIV) +Trap on integer divide by zero. + +mcond-move-int +Target Var(TARGET_COND_MOVE_INT) Init(1) +Conditional moves for integral are enabled. + +mcond-move-float +Target Var(TARGET_COND_MOVE_FLOAT) Init(1) +Conditional moves for float are enabled. + +mmemcpy +Target Mask(MEMCPY) +Prevent optimizing block moves, which is also the default behavior of -Os. + +mstrict-align +Target Var(TARGET_STRICT_ALIGN) Init(0) +Do not generate unaligned memory accesses. + +mmax-inline-memcpy-size= +Target Joined RejectNegative UInteger Var(loongarch_max_inline_memcpy_size) Init(1024) +-mmax-inline-memcpy-size=SIZE Set the max size of memcpy to inline, default is 1024. + +; The code model option names for -mcmodel. +Enum +Name(cmodel) Type(int) +The code model option names for -mcmodel: + +EnumValue +Enum(cmodel) String(normal) Value(CMODEL_NORMAL) + +EnumValue +Enum(cmodel) String(tiny) Value(CMODEL_TINY) + +EnumValue +Enum(cmodel) String(tiny-static) Value(CMODEL_TINY_STATIC) + +EnumValue +Enum(cmodel) String(large) Value(CMODEL_LARGE) + +EnumValue +Enum(cmodel) String(extreme) Value(CMODEL_EXTREME) + +mcmodel= +Target RejectNegative Joined Enum(cmodel) Var(la_opt_cmodel) Init(CMODEL_NORMAL) +Specify the code model. diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md new file mode 100644 index 0000000..edd74d4 --- /dev/null +++ b/gcc/config/loongarch/predicates.md @@ -0,0 +1,253 @@ +;; Predicate definitions for LoongArch target. +;; Copyright (C) 2021-2022 Free Software Foundation, Inc. +;; Contributed by Loongson Ltd. +;; Based on MIPS target for GNU compiler. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. 
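+
+;; As a usage note: these predicates are referenced by name from
+;; loongarch.md.  For instance, an operand written there as
+;;
+;;   (match_operand:SI 2 "arith_operand" "rI")
+;;
+;; accepts either a register or a CONST_INT satisfying IMM12_OPERAND
+;; (a 12-bit immediate); the definitions below implement those checks.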
+ +(define_predicate "const_uns_arith_operand" + (and (match_code "const_int") + (match_test "IMM12_OPERAND_UNSIGNED (INTVAL (op))"))) + +(define_predicate "uns_arith_operand" + (ior (match_operand 0 "const_uns_arith_operand") + (match_operand 0 "register_operand"))) + +(define_predicate "const_lu32i_operand" + (and (match_code "const_int") + (match_test "LU32I_OPERAND (INTVAL (op))"))) + +(define_predicate "const_lu52i_operand" + (and (match_code "const_int") + (match_test "LU52I_OPERAND (INTVAL (op))"))) + +(define_predicate "const_arith_operand" + (and (match_code "const_int") + (match_test "IMM12_OPERAND (INTVAL (op))"))) + +(define_predicate "const_imm16_operand" + (and (match_code "const_int") + (match_test "IMM16_OPERAND (INTVAL (op))"))) + +(define_predicate "arith_operand" + (ior (match_operand 0 "const_arith_operand") + (match_operand 0 "register_operand"))) + +(define_predicate "const_immalsl_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 1, 4)"))) + +(define_predicate "const_uimm5_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 31)"))) + +(define_predicate "const_uimm14_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 16383)"))) + +(define_predicate "const_uimm15_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 32767)"))) + +(define_predicate "const_imm12_operand" + (and (match_code "const_int") + (match_test "IMM12_OPERAND (INTVAL (op))"))) + +(define_predicate "sle_operand" + (and (match_code "const_int") + (match_test "IMM12_OPERAND (INTVAL (op) + 1)"))) + +(define_predicate "sleu_operand" + (and (match_operand 0 "sle_operand") + (match_test "INTVAL (op) + 1 != 0"))) + +(define_predicate "const_0_operand" + (and (match_code "const_int,const_double,const_vector") + (match_test "op == CONST0_RTX (GET_MODE (op))"))) + +(define_predicate "reg_or_0_operand" + (ior (match_operand 0 "const_0_operand") + (match_operand 0 "register_operand"))) + +(define_predicate "const_1_operand" + (and (match_code "const_int,const_double,const_vector") + (match_test "op == CONST1_RTX (GET_MODE (op))"))) + +(define_predicate "reg_or_1_operand" + (ior (match_operand 0 "const_1_operand") + (match_operand 0 "register_operand"))) + +(define_predicate "const_0_to_3_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 3)"))) + +(define_predicate "const_0_to_7_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 7)"))) + +(define_predicate "lu52i_mask_operand" + (and (match_code "const_int") + (match_test "UINTVAL (op) == 0xfffffffffffff"))) + +(define_predicate "low_bitmask_operand" + (and (match_code "const_int") + (match_test "low_bitmask_len (mode, INTVAL (op)) > 12"))) + +(define_predicate "const_call_insn_operand" + (match_code "const,symbol_ref,label_ref") +{ + enum loongarch_symbol_type symbol_type; + + if (!loongarch_symbolic_constant_p (op, &symbol_type)) + return false; + + switch (symbol_type) + { + case SYMBOL_GOT_DISP: + /* Without explicit relocs, there is no special syntax for + loading the address of a call destination into a register. + Using "la.global JIRL_REGS,foo; jirl JIRL_REGS" would prevent the lazy + binding of "foo", so keep the address of global symbols with the jirl + macro. 
*/ + return 1; + + default: + return false; + } +}) + +(define_predicate "call_insn_operand" + (ior (match_operand 0 "const_call_insn_operand") + (match_operand 0 "register_operand"))) + +(define_predicate "is_const_call_local_symbol" + (and (match_operand 0 "const_call_insn_operand") + (ior (match_test "loongarch_global_symbol_p (op) == 0") + (match_test "loongarch_symbol_binds_local_p (op) != 0")) + (match_test "CONSTANT_P (op)"))) + +(define_predicate "is_const_call_weak_symbol" + (and (match_operand 0 "const_call_insn_operand") + (not (match_operand 0 "is_const_call_local_symbol")) + (match_test "loongarch_weak_symbol_p (op) != 0") + (match_test "CONSTANT_P (op)"))) + +(define_predicate "is_const_call_plt_symbol" + (and (match_operand 0 "const_call_insn_operand") + (match_test "flag_plt != 0") + (match_test "loongarch_global_symbol_noweak_p (op) != 0") + (match_test "CONSTANT_P (op)"))) + +(define_predicate "is_const_call_global_noplt_symbol" + (and (match_operand 0 "const_call_insn_operand") + (match_test "flag_plt == 0") + (match_test "loongarch_global_symbol_noweak_p (op) != 0") + (match_test "CONSTANT_P (op)"))) + +;; A legitimate CONST_INT operand that takes more than one instruction +;; to load. +(define_predicate "splittable_const_int_operand" + (match_code "const_int") +{ + /* Don't handle multi-word moves this way; we don't want to introduce + the individual word-mode moves until after reload. */ + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + return false; + + /* Otherwise check whether the constant can be loaded in a single + instruction. */ + return !LU12I_INT (op) && !IMM12_INT (op) && !IMM12_INT_UNSIGNED (op) + && !LU52I_INT (op); +}) + +(define_predicate "move_operand" + (match_operand 0 "general_operand") +{ + enum loongarch_symbol_type symbol_type; + + /* The thinking here is as follows: + + (1) The move expanders should split complex load sequences into + individual instructions. Those individual instructions can + then be optimized by all rtl passes. + + (2) The target of pre-reload load sequences should not be used + to store temporary results. If the target register is only + assigned one value, reload can rematerialize that value + on demand, rather than spill it to the stack. + + (3) If we allowed pre-reload passes like combine and cse to recreate + complex load sequences, we would want to be able to split the + sequences before reload as well, so that the pre-reload scheduler + can see the individual instructions. This falls foul of (2); + the splitter would be forced to reuse the target register for + intermediate results. + + (4) We want to define complex load splitters for combine. These + splitters can request a temporary scratch register, which avoids + the problem in (2). They allow things like: + + (set (reg T1) (high SYM)) + (set (reg T2) (low (reg T1) SYM)) + (set (reg X) (plus (reg T2) (const_int OFFSET))) + + to be combined into: + + (set (reg T3) (high SYM+OFFSET)) + (set (reg X) (lo_sum (reg T3) SYM+OFFSET)) + + if T2 is only used this once. 
*/ + switch (GET_CODE (op)) + { + case CONST_INT: + return !splittable_const_int_operand (op, mode); + + case CONST: + case SYMBOL_REF: + case LABEL_REF: + return (loongarch_symbolic_constant_p (op, &symbol_type)); + default: + return true; + } +}) + +(define_predicate "symbolic_operand" + (match_code "const,symbol_ref,label_ref") +{ + enum loongarch_symbol_type type; + return loongarch_symbolic_constant_p (op, &type); +}) + +(define_predicate "equality_operator" + (match_code "eq,ne")) + +(define_predicate "order_operator" + (match_code "lt,ltu,le,leu,ge,geu,gt,gtu")) + +;; For NE, cstore uses sltu instructions in which the first operand is $0. + +(define_predicate "loongarch_cstore_operator" + (match_code "ne,eq,gt,gtu,ge,geu,lt,ltu,le,leu")) + +(define_predicate "small_data_pattern" + (and (match_code "set,parallel,unspec,unspec_volatile,prefetch") + (match_test "loongarch_small_data_pattern_p (op)"))) + +;; Return 1 if the operand is in non-volatile memory. +(define_predicate "non_volatile_mem_operand" + (and (match_operand 0 "memory_operand") + (not (match_test "MEM_VOLATILE_P (op)")))) diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md new file mode 100644 index 0000000..0c4f198 --- /dev/null +++ b/gcc/config/loongarch/sync.md @@ -0,0 +1,574 @@ +;; Machine description for LoongArch atomic operations. +;; Copyright (C) 2021-2022 Free Software Foundation, Inc. +;; Contributed by Loongson Ltd. +;; Based on MIPS and RISC-V target for GNU compiler. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. + +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +(define_c_enum "unspec" [ + UNSPEC_COMPARE_AND_SWAP + UNSPEC_COMPARE_AND_SWAP_ADD + UNSPEC_COMPARE_AND_SWAP_SUB + UNSPEC_COMPARE_AND_SWAP_AND + UNSPEC_COMPARE_AND_SWAP_XOR + UNSPEC_COMPARE_AND_SWAP_OR + UNSPEC_COMPARE_AND_SWAP_NAND + UNSPEC_SYNC_OLD_OP + UNSPEC_SYNC_EXCHANGE + UNSPEC_ATOMIC_STORE + UNSPEC_MEMORY_BARRIER +]) + +(define_code_iterator any_atomic [plus ior xor and]) +(define_code_attr atomic_optab + [(plus "add") (ior "or") (xor "xor") (and "and")]) + +;; This attribute gives the format suffix for atomic memory operations. +(define_mode_attr amo [(SI "w") (DI "d")]) + +;; <amop> expands to the name of the atomic operand that implements a +;; particular code. +(define_code_attr amop [(ior "or") (xor "xor") (and "and") (plus "add")]) + +;; Memory barriers. + +(define_expand "mem_thread_fence" + [(match_operand:SI 0 "const_int_operand" "")] ;; model + "" +{ + if (INTVAL (operands[0]) != MEMMODEL_RELAXED) + { + rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); + MEM_VOLATILE_P (mem) = 1; + emit_insn (gen_mem_thread_fence_1 (mem, operands[0])); + } + DONE; +}) + +;; Until the LoongArch memory model (hence its mapping from C++) is finalized, +;; conservatively emit a full FENCE. 
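+;;
+;; For illustration, with this conservative mapping a C11 fence such as
+;;
+;;   atomic_thread_fence (memory_order_acquire);
+;;
+;; goes through mem_thread_fence above and is emitted by the pattern
+;; below as the full barrier
+;;
+;;   dbar  0
+;;
+;; Only memory_order_relaxed avoids emitting a barrier.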
+(define_insn "mem_thread_fence_1" + [(set (match_operand:BLK 0 "" "") + (unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER)) + (match_operand:SI 1 "const_int_operand" "")] ;; model + "" + "dbar\t0") + +;; Atomic memory operations. + +;; Implement atomic stores with amoswap. Fall back to fences for atomic loads. +(define_insn "atomic_store<mode>" + [(set (match_operand:GPR 0 "memory_operand" "+ZB") + (unspec_volatile:GPR + [(match_operand:GPR 1 "reg_or_0_operand" "rJ") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPEC_ATOMIC_STORE))] + "" + "amswap%A2.<amo>\t$zero,%z1,%0" + [(set (attr "length") (const_int 8))]) + +(define_insn "atomic_<atomic_optab><mode>" + [(set (match_operand:GPR 0 "memory_operand" "+ZB") + (unspec_volatile:GPR + [(any_atomic:GPR (match_dup 0) + (match_operand:GPR 1 "reg_or_0_operand" "rJ")) + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPEC_SYNC_OLD_OP))] + "" + "am<amop>%A2.<amo>\t$zero,%z1,%0" + [(set (attr "length") (const_int 8))]) + +(define_insn "atomic_fetch_<atomic_optab><mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") + (match_operand:GPR 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:GPR + [(any_atomic:GPR (match_dup 1) + (match_operand:GPR 2 "reg_or_0_operand" "rJ")) + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_OLD_OP))] + "" + "am<amop>%A3.<amo>\t%0,%z2,%1" + [(set (attr "length") (const_int 8))]) + +(define_insn "atomic_exchange<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") + (unspec_volatile:GPR + [(match_operand:GPR 1 "memory_operand" "+ZB") + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_EXCHANGE)) + (set (match_dup 1) + (match_operand:GPR 2 "register_operand" "r"))] + "" + "amswap%A3.<amo>\t%0,%z2,%1" + [(set (attr "length") (const_int 8))]) + +(define_insn "atomic_cas_value_strong<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") + (match_operand:GPR 3 "reg_or_0_operand" "rJ") + (match_operand:SI 4 "const_int_operand") ;; mod_s + (match_operand:SI 5 "const_int_operand")] ;; mod_f + UNSPEC_COMPARE_AND_SWAP)) + (clobber (match_scratch:GPR 6 "=&r"))] + "" +{ + return "%G5\\n\\t" + "1:\\n\\t" + "ll.<amo>\\t%0,%1\\n\\t" + "bne\\t%0,%z2,2f\\n\\t" + "or%i3\\t%6,$zero,%3\\n\\t" + "sc.<amo>\\t%6,%1\\n\\t" + "beq\\t$zero,%6,1b\\n\\t" + "b\\t3f\\n\\t" + "2:\\n\\t" + "dbar\\t0x700\\n\\t" + "3:\\n\\t"; +} + [(set (attr "length") (const_int 32))]) + +(define_expand "atomic_compare_and_swap<mode>" + [(match_operand:SI 0 "register_operand" "") ;; bool output + (match_operand:GPR 1 "register_operand" "") ;; val output + (match_operand:GPR 2 "memory_operand" "") ;; memory + (match_operand:GPR 3 "reg_or_0_operand" "") ;; expected value + (match_operand:GPR 4 "reg_or_0_operand" "") ;; desired value + (match_operand:SI 5 "const_int_operand" "") ;; is_weak + (match_operand:SI 6 "const_int_operand" "") ;; mod_s + (match_operand:SI 7 "const_int_operand" "")] ;; mod_f + "" +{ + emit_insn (gen_atomic_cas_value_strong<mode> (operands[1], operands[2], + operands[3], operands[4], + operands[6], operands[7])); + + rtx compare = operands[1]; + if (operands[3] != const0_rtx) + { + rtx difference = gen_rtx_MINUS (<MODE>mode, operands[1], operands[3]); + compare = gen_reg_rtx (<MODE>mode); + emit_insn (gen_rtx_SET (compare, difference)); + } + + if (word_mode != <MODE>mode) + { + rtx reg = gen_reg_rtx (word_mode); + 
emit_insn (gen_rtx_SET (reg, gen_rtx_SIGN_EXTEND (word_mode, compare))); + compare = reg; + } + + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_EQ (SImode, compare, const0_rtx))); + DONE; +}) + +(define_expand "atomic_test_and_set" + [(match_operand:QI 0 "register_operand" "") ;; bool output + (match_operand:QI 1 "memory_operand" "+ZB") ;; memory + (match_operand:SI 2 "const_int_operand" "")] ;; model + "" +{ + /* We have no QImode atomics, so use the address LSBs to form a mask, + then use an aligned SImode atomic. */ + rtx result = operands[0]; + rtx mem = operands[1]; + rtx model = operands[2]; + rtx addr = force_reg (Pmode, XEXP (mem, 0)); + rtx tmp_reg = gen_reg_rtx (Pmode); + rtx zero_reg = gen_rtx_REG (Pmode, 0); + + rtx aligned_addr = gen_reg_rtx (Pmode); + emit_move_insn (tmp_reg, gen_rtx_PLUS (Pmode, zero_reg, GEN_INT (-4))); + emit_move_insn (aligned_addr, gen_rtx_AND (Pmode, addr, tmp_reg)); + + rtx aligned_mem = change_address (mem, SImode, aligned_addr); + set_mem_alias_set (aligned_mem, 0); + + rtx offset = gen_reg_rtx (SImode); + emit_move_insn (offset, gen_rtx_AND (SImode, gen_lowpart (SImode, addr), + GEN_INT (3))); + + rtx tmp = gen_reg_rtx (SImode); + emit_move_insn (tmp, GEN_INT (1)); + + rtx shmt = gen_reg_rtx (SImode); + emit_move_insn (shmt, gen_rtx_ASHIFT (SImode, offset, GEN_INT (3))); + + rtx word = gen_reg_rtx (SImode); + emit_move_insn (word, gen_rtx_ASHIFT (SImode, tmp, shmt)); + + tmp = gen_reg_rtx (SImode); + emit_insn (gen_atomic_fetch_orsi (tmp, aligned_mem, word, model)); + + emit_move_insn (gen_lowpart (SImode, result), + gen_rtx_LSHIFTRT (SImode, tmp, shmt)); + DONE; +}) + +(define_insn "atomic_cas_value_cmp_and_7_<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") + (match_operand:GPR 3 "reg_or_0_operand" "rJ") + (match_operand:GPR 4 "reg_or_0_operand" "rJ") + (match_operand:GPR 5 "reg_or_0_operand" "rJ") + (match_operand:SI 6 "const_int_operand")] ;; model + UNSPEC_COMPARE_AND_SWAP)) + (clobber (match_scratch:GPR 7 "=&r"))] + "" +{ + return "%G6\\n\\t" + "1:\\n\\t" + "ll.<amo>\\t%0,%1\\n\\t" + "and\\t%7,%0,%2\\n\\t" + "bne\\t%7,%z4,2f\\n\\t" + "and\\t%7,%0,%z3\\n\\t" + "or%i5\\t%7,%7,%5\\n\\t" + "sc.<amo>\\t%7,%1\\n\\t" + "beq\\t$zero,%7,1b\\n\\t" + "b\\t3f\\n\\t" + "2:\\n\\t" + "dbar\\t0x700\\n\\t" + "3:\\n\\t"; +} + [(set (attr "length") (const_int 40))]) + +(define_expand "atomic_compare_and_swap<mode>" + [(match_operand:SI 0 "register_operand" "") ;; bool output + (match_operand:SHORT 1 "register_operand" "") ;; val output + (match_operand:SHORT 2 "memory_operand" "") ;; memory + (match_operand:SHORT 3 "reg_or_0_operand" "") ;; expected value + (match_operand:SHORT 4 "reg_or_0_operand" "") ;; desired value + (match_operand:SI 5 "const_int_operand" "") ;; is_weak + (match_operand:SI 6 "const_int_operand" "") ;; mod_s + (match_operand:SI 7 "const_int_operand" "")] ;; mod_f + "" +{ + union loongarch_gen_fn_ptrs generator; + generator.fn_7 = gen_atomic_cas_value_cmp_and_7_si; + loongarch_expand_atomic_qihi (generator, operands[1], operands[2], + operands[3], operands[4], operands[7]); + + rtx compare = operands[1]; + if (operands[3] != const0_rtx) + { + machine_mode mode = GET_MODE (operands[3]); + rtx op1 = convert_modes (SImode, mode, operands[1], true); + rtx op3 = convert_modes (SImode, mode, operands[3], true); + rtx difference = gen_rtx_MINUS (SImode, op1, op3); + compare = gen_reg_rtx (SImode); + 
emit_insn (gen_rtx_SET (compare, difference)); + } + + if (word_mode != <MODE>mode) + { + rtx reg = gen_reg_rtx (word_mode); + emit_insn (gen_rtx_SET (reg, gen_rtx_SIGN_EXTEND (word_mode, compare))); + compare = reg; + } + + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_EQ (SImode, compare, const0_rtx))); + DONE; +}) + +(define_insn "atomic_cas_value_add_7_<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask + (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask + (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val + (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val + (match_operand:SI 6 "const_int_operand")] ;; model + UNSPEC_COMPARE_AND_SWAP_ADD)) + (clobber (match_scratch:GPR 7 "=&r")) + (clobber (match_scratch:GPR 8 "=&r"))] + "" +{ + return "%G6\\n\\t" + "1:\\n\\t" + "ll.<amo>\\t%0,%1\\n\\t" + "and\\t%7,%0,%3\\n\\t" + "add.w\\t%8,%0,%z5\\n\\t" + "and\\t%8,%8,%z2\\n\\t" + "or%i8\\t%7,%7,%8\\n\\t" + "sc.<amo>\\t%7,%1\\n\\t" + "beq\\t$zero,%7,1b"; +} + + [(set (attr "length") (const_int 32))]) + +(define_insn "atomic_cas_value_sub_7_<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask + (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask + (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val + (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val + (match_operand:SI 6 "const_int_operand")] ;; model + UNSPEC_COMPARE_AND_SWAP_SUB)) + (clobber (match_scratch:GPR 7 "=&r")) + (clobber (match_scratch:GPR 8 "=&r"))] + "" +{ + return "%G6\\n\\t" + "1:\\n\\t" + "ll.<amo>\\t%0,%1\\n\\t" + "and\\t%7,%0,%3\\n\\t" + "sub.w\\t%8,%0,%z5\\n\\t" + "and\\t%8,%8,%z2\\n\\t" + "or%i8\\t%7,%7,%8\\n\\t" + "sc.<amo>\\t%7,%1\\n\\t" + "beq\\t$zero,%7,1b"; +} + [(set (attr "length") (const_int 32))]) + +(define_insn "atomic_cas_value_and_7_<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask + (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask + (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val + (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val + (match_operand:SI 6 "const_int_operand")] ;; model + UNSPEC_COMPARE_AND_SWAP_AND)) + (clobber (match_scratch:GPR 7 "=&r")) + (clobber (match_scratch:GPR 8 "=&r"))] + "" +{ + return "%G6\\n\\t" + "1:\\n\\t" + "ll.<amo>\\t%0,%1\\n\\t" + "and\\t%7,%0,%3\\n\\t" + "and\\t%8,%0,%z5\\n\\t" + "and\\t%8,%8,%z2\\n\\t" + "or%i8\\t%7,%7,%8\\n\\t" + "sc.<amo>\\t%7,%1\\n\\t" + "beq\\t$zero,%7,1b"; +} + [(set (attr "length") (const_int 32))]) + +(define_insn "atomic_cas_value_xor_7_<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask + (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask + (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val + (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val + (match_operand:SI 6 "const_int_operand")] ;; model + UNSPEC_COMPARE_AND_SWAP_XOR)) + (clobber (match_scratch:GPR 7 "=&r")) + (clobber (match_scratch:GPR 8 "=&r"))] 
+ "" +{ + return "%G6\\n\\t" + "1:\\n\\t" + "ll.<amo>\\t%0,%1\\n\\t" + "and\\t%7,%0,%3\\n\\t" + "xor\\t%8,%0,%z5\\n\\t" + "and\\t%8,%8,%z2\\n\\t" + "or%i8\\t%7,%7,%8\\n\\t" + "sc.<amo>\\t%7,%1\\n\\t" + "beq\\t$zero,%7,1b"; +} + + [(set (attr "length") (const_int 32))]) + +(define_insn "atomic_cas_value_or_7_<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask + (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask + (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val + (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val + (match_operand:SI 6 "const_int_operand")] ;; model + UNSPEC_COMPARE_AND_SWAP_OR)) + (clobber (match_scratch:GPR 7 "=&r")) + (clobber (match_scratch:GPR 8 "=&r"))] + "" +{ + return "%G6\\n\\t" + "1:\\n\\t" + "ll.<amo>\\t%0,%1\\n\\t" + "and\\t%7,%0,%3\\n\\t" + "or\\t%8,%0,%z5\\n\\t" + "and\\t%8,%8,%z2\\n\\t" + "or%i8\\t%7,%7,%8\\n\\t" + "sc.<amo>\\t%7,%1\\n\\t" + "beq\\t$zero,%7,1b"; +} + + [(set (attr "length") (const_int 32))]) + +(define_insn "atomic_cas_value_nand_7_<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask + (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask + (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val + (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val + (match_operand:SI 6 "const_int_operand")] ;; model + UNSPEC_COMPARE_AND_SWAP_NAND)) + (clobber (match_scratch:GPR 7 "=&r")) + (clobber (match_scratch:GPR 8 "=&r"))] + "" +{ + return "%G6\\n\\t" + "1:\\n\\t" + "ll.<amo>\\t%0,%1\\n\\t" + "and\\t%7,%0,%3\\n\\t" + "and\\t%8,%0,%z5\\n\\t" + "xor\\t%8,%8,%z2\\n\\t" + "or%i8\\t%7,%7,%8\\n\\t" + "sc.<amo>\\t%7,%1\\n\\t" + "beq\\t$zero,%7,1b"; +} + [(set (attr "length") (const_int 32))]) + +(define_expand "atomic_exchange<mode>" + [(set (match_operand:SHORT 0 "register_operand") + (unspec_volatile:SHORT + [(match_operand:SHORT 1 "memory_operand") + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_EXCHANGE)) + (set (match_dup 1) + (match_operand:SHORT 2 "register_operand"))] + "" +{ + union loongarch_gen_fn_ptrs generator; + generator.fn_7 = gen_atomic_cas_value_cmp_and_7_si; + loongarch_expand_atomic_qihi (generator, operands[0], operands[1], + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "atomic_fetch_add<mode>" + [(set (match_operand:SHORT 0 "register_operand" "=&r") + (match_operand:SHORT 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:SHORT + [(plus:SHORT (match_dup 1) + (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_OLD_OP))] + "" +{ + union loongarch_gen_fn_ptrs generator; + generator.fn_7 = gen_atomic_cas_value_add_7_si; + loongarch_expand_atomic_qihi (generator, operands[0], operands[1], + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "atomic_fetch_sub<mode>" + [(set (match_operand:SHORT 0 "register_operand" "=&r") + (match_operand:SHORT 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:SHORT + [(minus:SHORT (match_dup 1) + (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_OLD_OP))] + "" +{ + union loongarch_gen_fn_ptrs generator; + generator.fn_7 = 
gen_atomic_cas_value_sub_7_si; + loongarch_expand_atomic_qihi (generator, operands[0], operands[1], + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "atomic_fetch_and<mode>" + [(set (match_operand:SHORT 0 "register_operand" "=&r") + (match_operand:SHORT 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:SHORT + [(and:SHORT (match_dup 1) + (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_OLD_OP))] + "" +{ + union loongarch_gen_fn_ptrs generator; + generator.fn_7 = gen_atomic_cas_value_and_7_si; + loongarch_expand_atomic_qihi (generator, operands[0], operands[1], + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "atomic_fetch_xor<mode>" + [(set (match_operand:SHORT 0 "register_operand" "=&r") + (match_operand:SHORT 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:SHORT + [(xor:SHORT (match_dup 1) + (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_OLD_OP))] + "" +{ + union loongarch_gen_fn_ptrs generator; + generator.fn_7 = gen_atomic_cas_value_xor_7_si; + loongarch_expand_atomic_qihi (generator, operands[0], operands[1], + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "atomic_fetch_or<mode>" + [(set (match_operand:SHORT 0 "register_operand" "=&r") + (match_operand:SHORT 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:SHORT + [(ior:SHORT (match_dup 1) + (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_OLD_OP))] + "" +{ + union loongarch_gen_fn_ptrs generator; + generator.fn_7 = gen_atomic_cas_value_or_7_si; + loongarch_expand_atomic_qihi (generator, operands[0], operands[1], + operands[1], operands[2], operands[3]); + DONE; +}) + +(define_expand "atomic_fetch_nand<mode>" + [(set (match_operand:SHORT 0 "register_operand" "=&r") + (match_operand:SHORT 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:SHORT + [(not:SHORT (and:SHORT (match_dup 1) + (match_operand:SHORT 2 "reg_or_0_operand" "rJ"))) + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_OLD_OP))] + "" +{ + union loongarch_gen_fn_ptrs generator; + generator.fn_7 = gen_atomic_cas_value_nand_7_si; + loongarch_expand_atomic_qihi (generator, operands[0], operands[1], + operands[1], operands[2], operands[3]); + DONE; +}) diff --git a/gcc/config/loongarch/t-linux b/gcc/config/loongarch/t-linux new file mode 100644 index 0000000..5b9796a --- /dev/null +++ b/gcc/config/loongarch/t-linux @@ -0,0 +1,53 @@ +# Copyright (C) 2021-2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. 
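Returning briefly to the sync.md expanders above: all of the QImode/HImode atomic_fetch_* patterns are routed through loongarch_expand_atomic_qihi, i.e. sub-word atomics are emulated on the naturally aligned 32-bit word containing the operand. Below is a very rough C-level sketch of that masking idea; it assumes a little-endian byte layout and uses a generic compare-and-swap purely for illustration, whereas the real expansion emits an ll.w/sc.w retry loop with the mask and inverted-mask operands seen in atomic_cas_value_add_7_<mode>.

  #include <stdatomic.h>
  #include <stdint.h>

  /* Sketch only: emulate an 8-bit atomic fetch-add with a 32-bit CAS on
     the containing aligned word.  */
  static uint8_t
  fetch_add_u8 (uint8_t *p, uint8_t val)
  {
    uintptr_t addr = (uintptr_t) p;
    _Atomic uint32_t *word = (_Atomic uint32_t *) (addr & ~(uintptr_t) 3);
    unsigned int shift = (addr & 3) * 8;   /* byte lane, little endian */
    uint32_t mask = (uint32_t) 0xff << shift;

    uint32_t old = atomic_load_explicit (word, memory_order_relaxed);
    uint32_t repl;
    do
      {
        /* Add within the byte lane, drop any carry out of the lane, and
           splice the result back into the untouched bytes.  */
        uint32_t lane = (old + ((uint32_t) val << shift)) & mask;
        repl = (old & ~mask) | lane;
      }
    while (!atomic_compare_exchange_weak (word, &old, repl));

    return (uint8_t) (old >> shift);
  }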
+ +# Multilib +MULTILIB_OPTIONS = mabi=lp64d/mabi=lp64f/mabi=lp64s +MULTILIB_DIRNAMES = base/lp64d base/lp64f base/lp64s + +# The GCC driver always gets all abi-related options on the command line. +# (see loongarch-driver.c:driver_get_normalized_m_opts) +comma=, +MULTILIB_REQUIRED = $(subst $(comma), ,$(TM_MULTILIB_CONFIG)) + +# Multiarch +ifneq ($(call if_multiarch,yes),yes) + # Define LA_DISABLE_MULTIARCH if multiarch is disabled. + tm_defines += LA_DISABLE_MULTIARCH +else + # Only define MULTIARCH_DIRNAME when multiarch is enabled, + # or it would always introduce ${target} into the search path. + MULTIARCH_DIRNAME = $(LA_MULTIARCH_TRIPLET) +endif + +# Don't define MULTILIB_OSDIRNAMES if multilib is disabled. +ifeq ($(filter LA_DISABLE_MULTILIB,$(tm_defines)),) + + MULTILIB_OSDIRNAMES = \ + mabi.lp64d=../lib64$\ + $(call if_multiarch,:loongarch64-linux-gnuf64) + + MULTILIB_OSDIRNAMES += \ + mabi.lp64f=../lib64/f32$\ + $(call if_multiarch,:loongarch64-linux-gnuf32) + + MULTILIB_OSDIRNAMES += \ + mabi.lp64s=../lib64/sf$\ + $(call if_multiarch,:loongarch64-linux-gnusf) + +endif diff --git a/gcc/config/loongarch/t-loongarch b/gcc/config/loongarch/t-loongarch new file mode 100644 index 0000000..6d6e343 --- /dev/null +++ b/gcc/config/loongarch/t-loongarch @@ -0,0 +1,71 @@ +# Copyright (C) 2021-2022 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. 
+ +# Canonical target triplet from config.gcc +LA_MULTIARCH_TRIPLET = $(patsubst LA_MULTIARCH_TRIPLET=%,%,$\ +$(filter LA_MULTIARCH_TRIPLET=%,$(tm_defines))) + +# String definition header +LA_STR_H = $(srcdir)/config/loongarch/loongarch-str.h + +# String definition header +$(LA_STR_H): s-loongarch-str ; @true +s-loongarch-str: $(srcdir)/config/loongarch/genopts/genstr.sh \ + $(srcdir)/config/loongarch/genopts/loongarch-strings + $(SHELL) $(srcdir)/config/loongarch/genopts/genstr.sh header \ + $(srcdir)/config/loongarch/genopts/loongarch-strings > \ + tmp-loongarch-str.h + $(SHELL) $(srcdir)/../move-if-change tmp-loongarch-str.h \ + $(LA_STR_H) + $(STAMP) s-loongarch-str + +loongarch-c.o: $(srcdir)/config/loongarch/loongarch-c.cc $(CONFIG_H) $(SYSTEM_H) \ + coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H) $(TARGET_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/loongarch/loongarch-c.cc + +loongarch-builtins.o: $(srcdir)/config/loongarch/loongarch-builtins.cc $(CONFIG_H) \ + $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) $(RECOG_H) langhooks.h \ + $(DIAGNOSTIC_CORE_H) $(OPTABS_H) $(srcdir)/config/loongarch/loongarch-ftypes.def \ + $(srcdir)/config/loongarch/loongarch-modes.def + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/loongarch/loongarch-builtins.cc + +loongarch-driver.o : $(srcdir)/config/loongarch/loongarch-driver.cc $(LA_STR_H) \ + $(CONFIG_H) $(SYSTEM_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $< + +loongarch-opts.o: $(srcdir)/config/loongarch/loongarch-opts.cc $(LA_STR_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $< + +loongarch-cpu.o: $(srcdir)/config/loongarch/loongarch-cpu.cc $(LA_STR_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $< + +loongarch-def.o: $(srcdir)/config/loongarch/loongarch-def.c $(LA_STR_H) + $(CC) -c $(ALL_CFLAGS) $(INCLUDES) $< + +$(srcdir)/config/loongarch/loongarch.opt: s-loongarch-opt ; @true +s-loongarch-opt: $(srcdir)/config/loongarch/genopts/genstr.sh \ + $(srcdir)/config/loongarch/genopts/loongarch.opt.in \ + $(srcdir)/config/loongarch/genopts/loongarch-strings $(LA_STR_H) + $(SHELL) $(srcdir)/config/loongarch/genopts/genstr.sh opt \ + $(srcdir)/config/loongarch/genopts/loongarch.opt.in \ + > tmp-loongarch.opt + $(SHELL) $(srcdir)/../move-if-change tmp-loongarch.opt \ + $(srcdir)/config/loongarch/loongarch.opt + $(STAMP) s-loongarch-opt diff --git a/gcc/config/m32c/m32c.cc b/gcc/config/m32c/m32c.cc index 11ca9a4..5a19faa 100644 --- a/gcc/config/m32c/m32c.cc +++ b/gcc/config/m32c/m32c.cc @@ -1090,7 +1090,7 @@ static struct { FB_REGNO, 0x01, 2, 4 } }; -#define PUSHM_N (sizeof(pushm_info)/sizeof(pushm_info[0])) +#define PUSHM_N (ARRAY_SIZE (pushm_info)) /* Returns TRUE if we need to save/restore the given register. We save everything for exception handlers, so that any register can be diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc index a1c4b43..5eb8459 100644 --- a/gcc/config/mips/mips.cc +++ b/gcc/config/mips/mips.cc @@ -6042,11 +6042,28 @@ mips_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) for (i = 0; i < info.reg_words; i++) { rtx reg; + bool zero_width_field_abi_change = false; for (; field; field = DECL_CHAIN (field)) - if (TREE_CODE (field) == FIELD_DECL - && int_bit_position (field) >= bitpos) - break; + { + if (TREE_CODE (field) != FIELD_DECL) + continue; + + /* Ignore zero-width fields. 
And, if the ignored + field is not a C++ zero-width bit-field, it may be + an ABI change. */ + if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field)) + continue; + if (DECL_SIZE (field) + && integer_zerop (DECL_SIZE (field))) + { + zero_width_field_abi_change = true; + continue; + } + + if (int_bit_position (field) >= bitpos) + break; + } if (field && int_bit_position (field) == bitpos @@ -6054,7 +6071,29 @@ mips_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) && TYPE_PRECISION (TREE_TYPE (field)) == BITS_PER_WORD) reg = gen_rtx_REG (DFmode, FP_ARG_FIRST + info.reg_offset + i); else - reg = gen_rtx_REG (DImode, GP_ARG_FIRST + info.reg_offset + i); + { + reg = gen_rtx_REG (DImode, + GP_ARG_FIRST + info.reg_offset + i); + zero_width_field_abi_change = false; + } + + if (zero_width_field_abi_change && warn_psabi) + { + static unsigned last_reported_type_uid; + unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (arg.type)); + if (uid != last_reported_type_uid) + { + static const char *url + = CHANGES_ROOT_URL + "gcc-12/changes.html#mips_zero_width_fields"; + inform (input_location, + "the ABI for passing a value containing " + "zero-width fields before an adjacent " + "64-bit floating-point field was changed " + "in GCC %{12.1%}", url); + last_reported_type_uid = uid; + } + } XVECEXP (ret, 0, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, @@ -6274,10 +6313,26 @@ mips_callee_copies (cumulative_args_t, const function_arg_info &arg) For n32 & n64, a structure with one or two fields is returned in floating-point registers as long as every field has a floating-point - type. */ + type. + + The C++ FE used to remove zero-width bit-fields in GCC 11 and earlier. + To make a proper diagnostic, this function will set + HAS_CXX_ZERO_WIDTH_BF to true once a C++ zero-width bit-field shows up, + and then ignore it. + + We had failed to ignore C++17 empty bases in GCC 7, 8, 9, 10, and 11. + This caused an ABI incompatibility between C++14 and C++17. This is + fixed now and to make a proper diagnostic, this function will set + HAS_CXX17_EMPTY_BASE to true once a C++17 empty base shows up, and + then ignore it. + + The caller should use the value of HAS_CXX17_EMPTY_BASE and/or + HAS_CXX_ZERO_WIDTH_BF to emit a proper -Wpsabi inform. */ static int -mips_fpr_return_fields (const_tree valtype, tree *fields) +mips_fpr_return_fields (const_tree valtype, tree *fields, + bool *has_cxx_zero_width_bf, + bool *has_cxx17_empty_base) { tree field; int i; @@ -6294,6 +6349,18 @@ mips_fpr_return_fields (const_tree valtype, tree *fields) if (TREE_CODE (field) != FIELD_DECL) continue; + if (cxx17_empty_base_field_p (field)) + { + *has_cxx17_empty_base = true; + continue; + } + + if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field)) + { + *has_cxx_zero_width_bf = true; + continue; + } + if (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (field))) return 0; @@ -6318,12 +6385,19 @@ mips_fpr_return_fields (const_tree valtype, tree *fields) static bool mips_return_in_msb (const_tree valtype) { + if (!TARGET_NEWABI || !TARGET_BIG_ENDIAN || !AGGREGATE_TYPE_P (valtype)) + return false; + tree fields[2]; + bool has_cxx_zero_width_bf = false; + + /* Its value is not used. 
*/ + bool has_cxx17_empty_base = false; - return (TARGET_NEWABI - && TARGET_BIG_ENDIAN - && AGGREGATE_TYPE_P (valtype) - && mips_fpr_return_fields (valtype, fields) == 0); + return (mips_fpr_return_fields (valtype, fields, + &has_cxx_zero_width_bf, + &has_cxx17_empty_base) == 0 + || has_cxx_zero_width_bf); } /* Return true if the function return value MODE will get returned in a @@ -6418,8 +6492,63 @@ mips_function_value_1 (const_tree valtype, const_tree fn_decl_or_type, return values, promote the mode here too. */ mode = promote_function_mode (valtype, mode, &unsigned_p, func, 1); + bool has_cxx_zero_width_bf = false; + bool has_cxx17_empty_base = false; + int use_fpr = mips_fpr_return_fields (valtype, fields, + &has_cxx_zero_width_bf, + &has_cxx17_empty_base); + + /* If has_cxx_zero_width_bf and has_cxx17_empty_base are both + true, it *happens* that there is no ABI change. So we won't + inform in this case. */ + if (TARGET_HARD_FLOAT + && warn_psabi + && has_cxx_zero_width_bf + && !has_cxx17_empty_base + && use_fpr != 0) + { + static unsigned last_reported_type_uid; + unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (valtype)); + if (uid != last_reported_type_uid) + { + static const char *url + = CHANGES_ROOT_URL + "gcc-12/changes.html#zero_width_bitfields"; + inform (input_location, + "the ABI for returning a value containing " + "zero-width bit-fields but otherwise an aggregate " + "with only one or two floating-point fields was " + "changed in GCC %{12.1%}", url); + last_reported_type_uid = uid; + } + } + + if (has_cxx_zero_width_bf) + use_fpr = 0; + + if (TARGET_HARD_FLOAT + && warn_psabi + && use_fpr != 0 + && has_cxx17_empty_base) + { + static unsigned last_reported_type_uid; + unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (valtype)); + if (uid != last_reported_type_uid) + { + static const char *url + = CHANGES_ROOT_URL + "gcc-12/changes.html#mips_cxx17_empty_bases"; + inform (input_location, + "the ABI for returning a value with C++17 empty " + "bases but otherwise an aggregate with only one or " + "two floating-point fields was changed in GCC " + "%{12.1%}", url); + last_reported_type_uid = uid; + } + } + /* Handle structures whose fields are returned in $f0/$f2. */ - switch (mips_fpr_return_fields (valtype, fields)) + switch (use_fpr) { case 1: return mips_return_fpr_single (mode, @@ -12254,10 +12383,22 @@ mips_expand_prologue (void) /* Insert the RIPL into our copy of SR (k1) as the new IPL. */ if (!cfun->machine->keep_interrupts_masked_p && cfun->machine->int_mask == INT_MASK_EIC) - emit_insn (gen_insvsi (gen_rtx_REG (SImode, K1_REG_NUM), - GEN_INT (6), + { + emit_insn (gen_insvsi (gen_rtx_REG (SImode, K1_REG_NUM), + TARGET_MCU ? GEN_INT (7) : GEN_INT (6), GEN_INT (SR_IPL), gen_rtx_REG (SImode, K0_REG_NUM))); + if (TARGET_MCU) + { + emit_insn (gen_lshrsi3 (gen_rtx_REG (SImode, K0_REG_NUM), + gen_rtx_REG (SImode, K0_REG_NUM), + GEN_INT (7))); + emit_insn (gen_insvsi (gen_rtx_REG (SImode, K1_REG_NUM), + GEN_INT (1), + GEN_INT (SR_IPL+8), + gen_rtx_REG (SImode, K0_REG_NUM))); + } + } /* Clear all interrupt mask bits up to and including the handler's interrupt line. */ @@ -21649,9 +21790,13 @@ mips_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. 
*/ static bool -mips_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) +mips_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) { + if (vmode != op_mode) + return false; + struct expand_vec_perm_d d; int i, nelt, which; unsigned char orig_perm[MAX_VECT_LEN]; diff --git a/gcc/config/nvptx/gen-opt.sh b/gcc/config/nvptx/gen-opt.sh index 5248ed2..ba04889 100644 --- a/gcc/config/nvptx/gen-opt.sh +++ b/gcc/config/nvptx/gen-opt.sh @@ -44,7 +44,7 @@ echo cat <<EOF Enum Name(ptx_isa) Type(int) -Known PTX ISA versions (for use with the -misa= option): +Known PTX ISA target architectures (for use with the -misa= option): EOF # Separator. diff --git a/gcc/config/nvptx/nvptx-c.cc b/gcc/config/nvptx/nvptx-c.cc index 02f7562..f060a8a 100644 --- a/gcc/config/nvptx/nvptx-c.cc +++ b/gcc/config/nvptx/nvptx-c.cc @@ -49,5 +49,14 @@ nvptx_cpu_cpp_builtins (void) #include "nvptx-sm.def" #undef NVPTX_SM cpp_define (parse_in, ptx_sm); + + { + unsigned major + = ptx_version_to_number ((ptx_version)ptx_version_option, true); + unsigned minor + = ptx_version_to_number ((ptx_version)ptx_version_option, false); + cpp_define_formatted (parse_in, "__PTX_ISA_VERSION_MAJOR__=%u", major); + cpp_define_formatted (parse_in, "__PTX_ISA_VERSION_MINOR__=%u", minor); + } } diff --git a/gcc/config/nvptx/nvptx-gen.opt b/gcc/config/nvptx/nvptx-gen.opt index b6d433e..0f5889e 100644 --- a/gcc/config/nvptx/nvptx-gen.opt +++ b/gcc/config/nvptx/nvptx-gen.opt @@ -21,7 +21,7 @@ Enum Name(ptx_isa) Type(int) -Known PTX ISA versions (for use with the -misa= option): +Known PTX ISA target architectures (for use with the -misa= option): EnumValue Enum(ptx_isa) String(sm_30) Value(PTX_ISA_SM30) diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index 0bf9af4..dfa08ec 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -43,6 +43,8 @@ extern void nvptx_output_ascii (FILE *, const char *, unsigned HOST_WIDE_INT); extern void nvptx_cpu_cpp_builtins (void); extern void nvptx_register_pragmas (void); extern unsigned int nvptx_data_alignment (const_tree, unsigned int); +extern void nvptx_asm_output_def_from_decls (FILE *, tree, tree); +extern unsigned int ptx_version_to_number (enum ptx_version, bool); #ifdef RTX_CODE extern void nvptx_expand_oacc_fork (unsigned); diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 3a7be63..e4297e2 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -77,6 +77,7 @@ #include "opts.h" #include "tree-pretty-print.h" #include "rtl-iter.h" +#include "cgraph.h" /* This file should be included last. */ #include "target-def.h" @@ -271,6 +272,28 @@ ptx_version_to_string (enum ptx_version v) } } +unsigned int +ptx_version_to_number (enum ptx_version v, bool major_p) +{ + switch (v) + { + case PTX_VERSION_3_0: + return major_p ? 3 : 0; + case PTX_VERSION_3_1: + return major_p ? 3 : 1; + case PTX_VERSION_4_2: + return major_p ? 4 : 2; + case PTX_VERSION_6_0: + return major_p ? 6 : 0; + case PTX_VERSION_6_3: + return major_p ? 6 : 3; + case PTX_VERSION_7_0: + return major_p ? 
7 : 0; + default: + gcc_unreachable (); + } +} + static const char * sm_version_to_string (enum ptx_isa sm) { @@ -300,8 +323,8 @@ handle_ptx_version_option (void) = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); if (ptx_version_option < first) - error ("PTX version (-mptx) needs to be at least %s to support selected" - " -misa (sm_%s)", ptx_version_to_string (first), + error ("PTX version (%<-mptx%>) needs to be at least %s to support selected" + " %<-misa%> (sm_%s)", ptx_version_to_string (first), sm_version_to_string ((enum ptx_isa)ptx_isa_option)); } @@ -968,7 +991,8 @@ static void write_fn_proto_1 (std::stringstream &s, bool is_defn, const char *name, const_tree decl) { - write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name); + if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL) + write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name); /* PTX declaration. */ if (DECL_EXTERNAL (decl)) @@ -7154,7 +7178,7 @@ nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, static bool nvptx_scalar_mode_supported_p (scalar_mode mode) { - if (mode == HFmode && TARGET_SM53) + if (nvptx_experimental && mode == HFmode && TARGET_SM53) return true; return default_scalar_mode_supported_p (mode); @@ -7163,7 +7187,7 @@ nvptx_scalar_mode_supported_p (scalar_mode mode) static bool nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode) { - if (mode == HFmode && TARGET_SM53) + if (nvptx_experimental && mode == HFmode && TARGET_SM53) return true; return default_libgcc_floating_mode_supported_p (mode); @@ -7393,6 +7417,76 @@ nvptx_mem_local_p (rtx mem) return false; } +/* Define locally, for use in NVPTX_ASM_OUTPUT_DEF. */ +#define SET_ASM_OP ".alias " + +/* Define locally, for use in nvptx_asm_output_def_from_decls. Add NVPTX_ + prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h. + Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating + semicolon. */ +#define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2) \ + do \ + { \ + fprintf ((FILE), "%s", SET_ASM_OP); \ + assemble_name (FILE, LABEL1); \ + fprintf (FILE, ","); \ + assemble_name (FILE, LABEL2); \ + fprintf (FILE, ";\n"); \ + } \ + while (0) + +void +nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value) +{ + if (nvptx_alias == 0 || !TARGET_PTX_6_3) + { + /* Copied from assemble_alias. */ + error_at (DECL_SOURCE_LOCATION (name), + "alias definitions not supported in this configuration"); + TREE_ASM_WRITTEN (name) = 1; + return; + } + + if (lookup_attribute ("weak", DECL_ATTRIBUTES (name))) + { + /* Prevent execution FAILs for gcc.dg/globalalias.c and + gcc.dg/pr77587.c. */ + error_at (DECL_SOURCE_LOCATION (name), + "weak alias definitions not supported in this configuration"); + TREE_ASM_WRITTEN (name) = 1; + return; + } + + /* Ptx also doesn't support value having weak linkage, but we can't detect + that here, so we'll end up with: + "error: Function test with .weak scope cannot be aliased". + See gcc.dg/localalias.c. */ + + if (TREE_CODE (name) != FUNCTION_DECL) + { + error_at (DECL_SOURCE_LOCATION (name), + "non-function alias definitions not supported" + " in this configuration"); + TREE_ASM_WRITTEN (name) = 1; + return; + } + + if (!cgraph_node::get (name)->referred_to_p ()) + /* Prevent "Internal error: reference to deleted section". 
*/ + return; + + std::stringstream s; + write_fn_proto (s, false, get_fnname_from_decl (name), name); + fputs (s.str ().c_str (), stream); + + tree id = DECL_ASSEMBLER_NAME (name); + NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id), + IDENTIFIER_POINTER (value)); +} + +#undef NVPTX_ASM_OUTPUT_DEF +#undef SET_ASM_OP + #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE nvptx_option_override diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index b55ade6..ed72c25 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -29,11 +29,6 @@ #define STARTFILE_SPEC "%{mmainkernel:crt0.o}" -/* Default needs to be in sync with default for misa in nvptx.opt. - We add a default here to work around a hard-coded sm_30 default in - nvptx-as. */ -#define ASM_SPEC "%{misa=*:-m %*; :-m sm_35}%{misa=sm_30:--no-verify}" - #define TARGET_CPU_CPP_BUILTINS() nvptx_cpu_cpp_builtins () /* Avoid the default in ../../gcc.cc, which adds "-pthread", which is not @@ -315,6 +310,23 @@ struct GTY(()) machine_function ((VALUE) = GET_MODE_BITSIZE ((MODE)), 2) #define SUPPORTS_WEAK 1 + +/* The documentation states that ASM_OUTPUT_DEF_FROM_DECLS is used in + preference to ASM_OUTPUT_DEF if the tree nodes are available. However, we + need the tree nodes to emit the prototype, so at this point it's not clear + how we can support ASM_OUTPUT_DEF. Still, we need to define it, or + ASM_OUTPUT_DEF_FROM_DECLS is ignored. For now, assert, and once we run + into it possibly improve by somehow emitting the prototype elsewhere, or + emitting a reasonable error message. */ +#define ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2) \ + do \ + { \ + gcc_unreachable (); \ + } \ + while (0) +#define ASM_OUTPUT_DEF_FROM_DECLS(STREAM, NAME, VALUE) \ + nvptx_asm_output_def_from_decls (STREAM, NAME, VALUE) + #define NO_DOT_IN_LABEL #define ASM_COMMENT_START "//" diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 1dec7ca..8ed6850 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -1881,6 +1881,10 @@ "" { emit_insn (gen_omp_simt_exit (Pmode, operands[0])); + if (TARGET_PTX_6_0) + emit_insn (gen_nvptx_warpsync ()); + else + emit_insn (gen_nvptx_uniform_warp_check ()); DONE; }) @@ -2276,13 +2280,14 @@ { const char *insns[] = { "{", - "\\t" ".reg.b32" "\\t" "act;", - "%.\\t" "vote.ballot.b32" "\\t" "act,1;", - "\\t" ".reg.pred" "\\t" "do_abort;", - "\\t" "mov.pred" "\\t" "do_abort,0;", - "%.\\t" "setp.ne.b32" "\\t" "do_abort,act,0xffffffff;", - "@ do_abort\\t" "trap;", - "@ do_abort\\t" "exit;", + "\\t" ".reg.b32" "\\t" "%%r_act;", + "%.\\t" "vote.ballot.b32" "\\t" "%%r_act,1;", + "\\t" ".reg.pred" "\\t" "%%r_do_abort;", + "\\t" "mov.pred" "\\t" "%%r_do_abort,0;", + "%.\\t" "setp.ne.b32" "\\t" "%%r_do_abort,%%r_act," + "0xffffffff;", + "@ %%r_do_abort\\t" "trap;", + "@ %%r_do_abort\\t" "exit;", "}", NULL }; diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index fea99c5..c5a5668 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -25,7 +25,8 @@ m64 Target RejectNegative Mask(ABI64) -Generate code for a 64-bit ABI. +Ignored, but preserved for backward compatibility. Only 64-bit ABI is +supported. mmainkernel Target RejectNegative @@ -51,14 +52,68 @@ mgomp Target Mask(GOMP) Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt. -; Default needs to be in sync with default in ASM_SPEC in nvptx.h. 
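Tying this back to the .alias support added in nvptx.cc above, here is a hypothetical source-level example of the only alias shape nvptx_asm_output_def_from_decls accepts; it assumes -malias is enabled and the PTX ISA version satisfies the TARGET_PTX_6_3 check shown earlier.

  /* Accepted: a non-weak function alias; it is emitted as a ".alias"
     directive by nvptx_asm_output_def_from_decls.  */
  int impl (int x) { return x + 1; }
  int entry (int) __attribute__ ((alias ("impl")));

  /* Rejected with an error by the same function:
     - weak aliases, e.g.  __attribute__ ((weak, alias ("impl")))
     - non-function aliases (the alias itself must be a FUNCTION_DECL).  */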
misa= Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM30) -Specify the version of the ptx ISA to use. +Specify the PTX ISA target architecture to use. + +march= +Target RejectNegative Joined Alias(misa=) +Alias: + +march-map=sm_30 +Target RejectNegative Alias(misa=,sm_30) + +march-map=sm_32 +Target RejectNegative Alias(misa=,sm_30) + +march-map=sm_35 +Target RejectNegative Alias(misa=,sm_35) + +march-map=sm_37 +Target RejectNegative Alias(misa=,sm_35) + +march-map=sm_50 +Target RejectNegative Alias(misa=,sm_35) + +march-map=sm_52 +Target RejectNegative Alias(misa=,sm_35) + +march-map=sm_53 +Target RejectNegative Alias(misa=,sm_53) + +march-map=sm_60 +Target RejectNegative Alias(misa=,sm_53) + +march-map=sm_61 +Target RejectNegative Alias(misa=,sm_53) + +march-map=sm_62 +Target RejectNegative Alias(misa=,sm_53) + +march-map=sm_70 +Target RejectNegative Alias(misa=,sm_70) + +march-map=sm_72 +Target RejectNegative Alias(misa=,sm_70) + +march-map=sm_75 +Target RejectNegative Alias(misa=,sm_75) + +march-map=sm_80 +Target RejectNegative Alias(misa=,sm_80) + +march-map=sm_86 +Target RejectNegative Alias(misa=,sm_80) + +march-map=sm_87 +Target RejectNegative Alias(misa=,sm_80) + +march-map=sm_90 +Target RejectNegative Alias(misa=,sm_80) Enum Name(ptx_version) Type(int) -Known PTX versions (for use with the -mptx= option): +Known PTX ISA versions (for use with the -mptx= option): EnumValue Enum(ptx_version) String(3.1) Value(PTX_VERSION_3_1) @@ -77,7 +132,7 @@ Enum(ptx_version) String(_) Value(PTX_VERSION_default) mptx= Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) -Specify the version of the ptx version to use. +Specify the PTX ISA version to use. minit-regs= Target Var(nvptx_init_regs) IntegerRange(0, 3) Joined UInteger Init(3) @@ -85,3 +140,9 @@ Initialize ptx registers. mptx-comment Target Var(nvptx_comment) Init(1) Undocumented + +malias +Target Var(nvptx_alias) Init(0) Undocumented + +mexperimental +Target Var(nvptx_experimental) Init(0) Undocumented diff --git a/gcc/config/nvptx/t-nvptx b/gcc/config/nvptx/t-nvptx index b63c4a5..2b68149 100644 --- a/gcc/config/nvptx/t-nvptx +++ b/gcc/config/nvptx/t-nvptx @@ -23,7 +23,8 @@ s-nvptx-gen-h: $(srcdir)/config/nvptx/nvptx-sm.def $(STAMP) s-nvptx-gen-h $(srcdir)/config/nvptx/nvptx-gen.opt: s-nvptx-gen-opt; @true -s-nvptx-gen-opt: $(srcdir)/config/nvptx/nvptx-sm.def +s-nvptx-gen-opt: $(srcdir)/config/nvptx/nvptx-sm.def \ + $(srcdir)/config/nvptx/gen-opt.sh $(SHELL) $(srcdir)/config/nvptx/gen-opt.sh "$(srcdir)/config/nvptx" \ > tmp-nvptx-gen.opt $(SHELL) $(srcdir)/../move-if-change \ diff --git a/gcc/config/riscv/arch-canonicalize b/gcc/config/riscv/arch-canonicalize index 49a6204..fd7651a 100755 --- a/gcc/config/riscv/arch-canonicalize +++ b/gcc/config/riscv/arch-canonicalize @@ -20,44 +20,63 @@ # along with GCC; see the file COPYING3. If not see # <http://www.gnu.org/licenses/>. +# TODO: Extract riscv_subset_t from riscv-common.cc and make it can be compiled +# standalone to replace this script, that also prevents us implementing +# that twice and keep sync again and again. from __future__ import print_function import sys +import argparse import collections import itertools from functools import reduce - -CANONICAL_ORDER = "imafdgqlcbjktpvn" +SUPPORTED_ISA_SPEC = ["2.2", "20190608", "20191213"] +CANONICAL_ORDER = "imafdgqlcbkjtpvn" LONG_EXT_PREFIXES = ['z', 's', 'h', 'x'] # # IMPLIED_EXT(ext) -> implied extension list. 
# IMPLIED_EXT = { - "d" : ["f"], - "zk" : ["zkn"], - "zk" : ["zkr"], - "zk" : ["zkt"], - "zkn" : ["zbkb"], - "zkn" : ["zbkc"], - "zkn" : ["zbkx"], - "zkn" : ["zkne"], - "zkn" : ["zknd"], - "zkn" : ["zknh"], - "zks" : ["zbkb"], - "zks" : ["zbkc"], - "zks" : ["zbkx"], - "zks" : ["zksed"], - "zks" : ["zksh"], + "d" : ["f", "zicsr"], + "f" : ["zicsr"], + "zk" : ["zkn", "zkr", "zkt"], + "zkn" : ["zbkb", "zbkc", "zbkx", "zkne", "zknd", "zknh"], + "zks" : ["zbkb", "zbkc", "zbkx", "zksed", "zksh"], + + "v" : ["zvl128b", "zve64d"], + "zve32x" : ["zvl32b"], + "zve64x" : ["zve32x", "zvl64b"], + "zve32f" : ["f", "zve32x"], + "zve64f" : ["f", "zve32f", "zve64x"], + "zve64d" : ["d", "zve64f"], + + "zvl64b" : ["zvl32b"], + "zvl128b" : ["zvl64b"], + "zvl256b" : ["zvl128b"], + "zvl512b" : ["zvl256b"], + "zvl1024b" : ["zvl512b"], + "zvl2048b" : ["zvl1024b"], + "zvl4096b" : ["zvl2048b"], + "zvl8192b" : ["zvl4096b"], + "zvl16384b" : ["zvl8192b"], + "zvl32768b" : ["zvl16384b"], + "zvl65536b" : ["zvl32768b"], } -def arch_canonicalize(arch): +def arch_canonicalize(arch, isa_spec): # TODO: Support extension version. + is_isa_spec_2p2 = isa_spec == '2.2' new_arch = "" + extra_long_ext = [] + std_exts = [] if arch[:5] in ['rv32e', 'rv32i', 'rv32g', 'rv64i', 'rv64g']: - # TODO: We should expand g to imad_zifencei once we support newer spec. - new_arch = arch[:5].replace("g", "imafd") + new_arch = arch[:5].replace("g", "i") + if arch[:5] in ['rv32g', 'rv64g']: + std_exts = ['m', 'a', 'f', 'd'] + if not is_isa_spec_2p2: + extra_long_ext = ['zicsr', 'zifencei'] else: raise Exception("Unexpected arch: `%s`" % arch[:5]) @@ -69,20 +88,29 @@ def arch_canonicalize(arch): if long_ext_prefixes_idx: first_long_ext_idx = min(long_ext_prefixes_idx) long_exts = arch[first_long_ext_idx:].split("_") - std_exts = list(arch[5:first_long_ext_idx]) + std_exts += list(arch[5:first_long_ext_idx]) else: long_exts = [] - std_exts = list(arch[5:]) + std_exts += list(arch[5:]) + + long_exts += extra_long_ext # # Handle implied extensions. # - for ext in std_exts + long_exts: - if ext in IMPLIED_EXT: - implied_exts = IMPLIED_EXT[ext] - for implied_ext in implied_exts: - if implied_ext not in std_exts + long_exts: - long_exts.append(implied_ext) + any_change = True + while any_change: + any_change = False + for ext in std_exts + long_exts: + if ext in IMPLIED_EXT: + implied_exts = IMPLIED_EXT[ext] + for implied_ext in implied_exts: + if implied_ext == 'zicsr' and is_isa_spec_2p2: + continue + + if implied_ext not in std_exts + long_exts: + long_exts.append(implied_ext) + any_change = True # Single letter extension might appear in the long_exts list, # becasue we just append extensions list to the arch string. @@ -99,6 +127,9 @@ def arch_canonicalize(arch): return (exts.startswith("x"), exts.startswith("zxm"), LONG_EXT_PREFIXES.index(exts[0]), canonical_sort, exts[1:]) + # Removing duplicates. + long_exts = list(set(long_exts)) + # Multi-letter extension must be in lexicographic order. long_exts = list(sorted(filter(lambda x:len(x) != 1, long_exts), key=longext_sort)) @@ -118,11 +149,20 @@ def arch_canonicalize(arch): # Concat rest of the multi-char extensions. 
if long_exts: new_arch += "_" + "_".join(long_exts) + return new_arch if len(sys.argv) < 2: print ("Usage: %s <arch_str> [<arch_str>*]" % sys.argv) sys.exit(1) -for arg in sys.argv[1:]: - print (arch_canonicalize(arg)) +parser = argparse.ArgumentParser() +parser.add_argument('-misa-spec', type=str, + default='20191213', + choices=SUPPORTED_ISA_SPEC) +parser.add_argument('arch_strs', nargs=argparse.REMAINDER) + +args = parser.parse_args() + +for arch in args.arch_strs: + print (arch_canonicalize(arch, args.misa_spec)) diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md index 0ab9ffe..6c1ccc6 100644 --- a/gcc/config/riscv/bitmanip.md +++ b/gcc/config/riscv/bitmanip.md @@ -79,6 +79,50 @@ [(set_attr "type" "bitmanip") (set_attr "mode" "DI")]) +;; During combine, we may encounter an attempt to combine +;; slli rtmp, rs, #imm +;; zext.w rtmp, rtmp +;; sh[123]add rd, rtmp, rs2 +;; which will lead to the immediate not satisfying the above constraints. +;; By splitting the compound expression, we can simplify to a slli and a +;; sh[123]add.uw. +(define_split + [(set (match_operand:DI 0 "register_operand") + (plus:DI (and:DI (ashift:DI (match_operand:DI 1 "register_operand") + (match_operand:QI 2 "immediate_operand")) + (match_operand:DI 3 "consecutive_bits_operand")) + (match_operand:DI 4 "register_operand"))) + (clobber (match_operand:DI 5 "register_operand"))] + "TARGET_64BIT && TARGET_ZBA" + [(set (match_dup 5) (ashift:DI (match_dup 1) (match_dup 6))) + (set (match_dup 0) (plus:DI (and:DI (ashift:DI (match_dup 5) + (match_dup 7)) + (match_dup 8)) + (match_dup 4)))] +{ + unsigned HOST_WIDE_INT mask = UINTVAL (operands[3]); + /* scale: shift within the sh[123]add.uw */ + int scale = 32 - clz_hwi (mask); + /* bias: pre-scale amount (i.e. the prior shift amount) */ + int bias = ctz_hwi (mask) - scale; + + /* If the bias + scale don't add up to operand[2], reject. */ + if ((scale + bias) != UINTVAL (operands[2])) + FAIL; + + /* If the shift-amount is out-of-range for sh[123]add.uw, reject. */ + if ((scale < 1) || (scale > 3)) + FAIL; + + /* If there's no bias, the '*shNadduw' pattern should have matched. */ + if (bias == 0) + FAIL; + + operands[6] = GEN_INT (bias); + operands[7] = GEN_INT (scale); + operands[8] = GEN_INT (0xffffffffULL << scale); +}) + (define_insn "*add.uw" [(set (match_operand:DI 0 "register_operand" "=r") (plus:DI (zero_extend:DI diff --git a/gcc/config/riscv/multilib-generator b/gcc/config/riscv/multilib-generator index 1ea2fb2..36698d4 100755 --- a/gcc/config/riscv/multilib-generator +++ b/gcc/config/riscv/multilib-generator @@ -46,16 +46,18 @@ import argparse # TODO: Add test for this script. 
# +SUPPORTED_ISA_SPEC = ["2.2", "20190608", "20191213"] arches = collections.OrderedDict() abis = collections.OrderedDict() required = [] reuse = [] -def arch_canonicalize(arch): +def arch_canonicalize(arch, isa_spec): this_file = os.path.abspath(os.path.join( __file__)) arch_can_script = \ os.path.join(os.path.dirname(this_file), "arch-canonicalize") - proc = subprocess.Popen([sys.executable, arch_can_script, arch], + proc = subprocess.Popen([sys.executable, arch_can_script, + '-misa-spec=%s' % isa_spec, arch], stdout=subprocess.PIPE) out, err = proc.communicate() return out.decode().strip() @@ -133,6 +135,9 @@ options = filter(lambda x:x.startswith("--"), sys.argv[1:]) parser = argparse.ArgumentParser() parser.add_argument("--cmodel", type=str) +parser.add_argument('-misa-spec', type=str, + default='20191213', + choices=SUPPORTED_ISA_SPEC) parser.add_argument("cfgs", type=str, nargs='*') args = parser.parse_args() @@ -158,13 +163,14 @@ for cmodel in cmodels: if cmodel == "compact" and arch.startswith("rv32"): continue - arch = arch_canonicalize (arch) + arch = arch_canonicalize (arch, args.misa_spec) arches[arch] = 1 abis[abi] = 1 extra = list(filter(None, extra.split(','))) ext_combs = expand_combination(ext) alts = sum([[x] + [x + y for y in ext_combs] for x in [arch] + extra], []) - alts = list(map(arch_canonicalize, alts)) + alts = filter(lambda x: len(x) != 0, alts) + alts = list(map(lambda a : arch_canonicalize(a, args.misa_spec), alts)) # Drop duplicated entry. alts = unique(alts) diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index 97cdbdf..90db5df 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -52,7 +52,7 @@ (match_test "INTVAL (op) + 1 != 0"))) (define_predicate "const_0_operand" - (and (match_code "const_int,const_wide_int,const_double,const_vector") + (and (match_code "const_int,const_wide_int,const_vector") (match_test "op == CONST0_RTX (GET_MODE (op))"))) (define_predicate "reg_or_0_operand" @@ -239,3 +239,18 @@ (define_predicate "const63_operand" (and (match_code "const_int") (match_test "INTVAL (op) == 63"))) + +(define_predicate "imm5_operand" + (and (match_code "const_int") + (match_test "INTVAL (op) < 5"))) + +;; A CONST_INT operand that consists of a single run of consecutive set bits. +(define_predicate "consecutive_bits_operand" + (match_code "const_int") +{ + unsigned HOST_WIDE_INT val = UINTVAL (op); + if (exact_log2 ((val >> ctz_hwi (val)) + 1) < 0) + return false; + + return true; +}) diff --git a/gcc/config/riscv/riscv-builtins.cc b/gcc/config/riscv/riscv-builtins.cc index 0658f8d..795132a 100644 --- a/gcc/config/riscv/riscv-builtins.cc +++ b/gcc/config/riscv/riscv-builtins.cc @@ -87,6 +87,18 @@ struct riscv_builtin_description { AVAIL (hard_float, TARGET_HARD_FLOAT) + +AVAIL (clean32, TARGET_ZICBOM && !TARGET_64BIT) +AVAIL (clean64, TARGET_ZICBOM && TARGET_64BIT) +AVAIL (flush32, TARGET_ZICBOM && !TARGET_64BIT) +AVAIL (flush64, TARGET_ZICBOM && TARGET_64BIT) +AVAIL (inval32, TARGET_ZICBOM && !TARGET_64BIT) +AVAIL (inval64, TARGET_ZICBOM && TARGET_64BIT) +AVAIL (zero32, TARGET_ZICBOZ && !TARGET_64BIT) +AVAIL (zero64, TARGET_ZICBOZ && TARGET_64BIT) +AVAIL (prefetchi32, TARGET_ZICBOP && !TARGET_64BIT) +AVAIL (prefetchi64, TARGET_ZICBOP && TARGET_64BIT) + /* Construct a riscv_builtin_description from the given arguments. INSN is the name of the associated instruction pattern, without the @@ -119,6 +131,8 @@ AVAIL (hard_float, TARGET_HARD_FLOAT) /* Argument types. 
*/ #define RISCV_ATYPE_VOID void_type_node #define RISCV_ATYPE_USI unsigned_intSI_type_node +#define RISCV_ATYPE_SI intSI_type_node +#define RISCV_ATYPE_DI intDI_type_node /* RISCV_FTYPE_ATYPESN takes N RISCV_FTYPES-like type codes and lists their associated RISCV_ATYPEs. */ @@ -128,6 +142,8 @@ AVAIL (hard_float, TARGET_HARD_FLOAT) RISCV_ATYPE_##A, RISCV_ATYPE_##B static const struct riscv_builtin_description riscv_builtins[] = { + #include "riscv-cmo.def" + DIRECT_BUILTIN (frflags, RISCV_USI_FTYPE, hard_float), DIRECT_NO_TARGET_BUILTIN (fsflags, RISCV_VOID_FTYPE_USI, hard_float) }; diff --git a/gcc/config/riscv/riscv-c.cc b/gcc/config/riscv/riscv-c.cc index 73c62f4..eb7ef09 100644 --- a/gcc/config/riscv/riscv-c.cc +++ b/gcc/config/riscv/riscv-c.cc @@ -104,6 +104,24 @@ riscv_cpu_cpp_builtins (cpp_reader *pfile) } + if (TARGET_MIN_VLEN != 0) + builtin_define_with_int_value ("__riscv_v_min_vlen", TARGET_MIN_VLEN); + + if (TARGET_VECTOR_ELEN_64) + builtin_define_with_int_value ("__riscv_v_elen", 64); + else if (TARGET_VECTOR_ELEN_32) + builtin_define_with_int_value ("__riscv_v_elen", 32); + + if (TARGET_VECTOR_ELEN_FP_64) + builtin_define_with_int_value ("__riscv_v_elen_fp", 64); + else if (TARGET_VECTOR_ELEN_FP_32) + builtin_define_with_int_value ("__riscv_v_elen_fp", 32); + else if (TARGET_MIN_VLEN != 0) + builtin_define_with_int_value ("__riscv_v_elen_fp", 0); + + if (TARGET_MIN_VLEN) + builtin_define ("__riscv_vector"); + /* Define architecture extension test macros. */ builtin_define_with_int_value ("__riscv_arch_test", 1); diff --git a/gcc/config/riscv/riscv-cmo.def b/gcc/config/riscv/riscv-cmo.def new file mode 100644 index 0000000..b30ecf9 --- /dev/null +++ b/gcc/config/riscv/riscv-cmo.def @@ -0,0 +1,17 @@ +// zicbom +RISCV_BUILTIN (clean_si, "zicbom_cbo_clean", RISCV_BUILTIN_DIRECT, RISCV_SI_FTYPE, clean32), +RISCV_BUILTIN (clean_di, "zicbom_cbo_clean", RISCV_BUILTIN_DIRECT, RISCV_DI_FTYPE, clean64), + +RISCV_BUILTIN (flush_si, "zicbom_cbo_flush", RISCV_BUILTIN_DIRECT, RISCV_SI_FTYPE, flush32), +RISCV_BUILTIN (flush_di, "zicbom_cbo_flush", RISCV_BUILTIN_DIRECT, RISCV_DI_FTYPE, flush64), + +RISCV_BUILTIN (inval_si, "zicbom_cbo_inval", RISCV_BUILTIN_DIRECT, RISCV_SI_FTYPE, inval32), +RISCV_BUILTIN (inval_di, "zicbom_cbo_inval", RISCV_BUILTIN_DIRECT, RISCV_DI_FTYPE, inval64), + +// zicboz +RISCV_BUILTIN (zero_si, "zicboz_cbo_zero", RISCV_BUILTIN_DIRECT, RISCV_SI_FTYPE, zero32), +RISCV_BUILTIN (zero_di, "zicboz_cbo_zero", RISCV_BUILTIN_DIRECT, RISCV_DI_FTYPE, zero64), + +// zicbop +RISCV_BUILTIN (prefetchi_si, "zicbop_cbo_prefetchi", RISCV_BUILTIN_DIRECT, RISCV_SI_FTYPE_SI, prefetchi32), +RISCV_BUILTIN (prefetchi_di, "zicbop_cbo_prefetchi", RISCV_BUILTIN_DIRECT, RISCV_DI_FTYPE_DI, prefetchi64), diff --git a/gcc/config/riscv/riscv-ftypes.def b/gcc/config/riscv/riscv-ftypes.def index 2214c49..6242129 100644 --- a/gcc/config/riscv/riscv-ftypes.def +++ b/gcc/config/riscv/riscv-ftypes.def @@ -28,3 +28,7 @@ along with GCC; see the file COPYING3. 
If not see DEF_RISCV_FTYPE (0, (USI)) DEF_RISCV_FTYPE (1, (VOID, USI)) +DEF_RISCV_FTYPE (0, (SI)) +DEF_RISCV_FTYPE (0, (DI)) +DEF_RISCV_FTYPE (1, (SI, SI)) +DEF_RISCV_FTYPE (1, (DI, DI)) diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index 929e4e3..1e153b3 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -105,10 +105,19 @@ enum stack_protector_guard { #define TARGET_ZKSH ((riscv_zk_subext & MASK_ZKSH) != 0) #define TARGET_ZKT ((riscv_zk_subext & MASK_ZKT) != 0) -#define MASK_VECTOR_EEW_32 (1 << 0) -#define MASK_VECTOR_EEW_64 (1 << 1) -#define MASK_VECTOR_EEW_FP_32 (1 << 2) -#define MASK_VECTOR_EEW_FP_64 (1 << 3) +#define MASK_VECTOR_ELEN_32 (1 << 0) +#define MASK_VECTOR_ELEN_64 (1 << 1) +#define MASK_VECTOR_ELEN_FP_32 (1 << 2) +#define MASK_VECTOR_ELEN_FP_64 (1 << 3) + +#define TARGET_VECTOR_ELEN_32 \ + ((riscv_vector_elen_flags & MASK_VECTOR_ELEN_32) != 0) +#define TARGET_VECTOR_ELEN_64 \ + ((riscv_vector_elen_flags & MASK_VECTOR_ELEN_64) != 0) +#define TARGET_VECTOR_ELEN_FP_32 \ + ((riscv_vector_elen_flags & MASK_VECTOR_ELEN_FP_32) != 0) +#define TARGET_VECTOR_ELEN_FP_64 \ + ((riscv_vector_elen_flags & MASK_VECTOR_ELEN_FP_64) != 0) #define MASK_ZVL32B (1 << 0) #define MASK_ZVL64B (1 << 1) @@ -136,4 +145,20 @@ enum stack_protector_guard { #define TARGET_ZVL32768B ((riscv_zvl_flags & MASK_ZVL32768B) != 0) #define TARGET_ZVL65536B ((riscv_zvl_flags & MASK_ZVL65536B) != 0) +#define MASK_ZICBOZ (1 << 0) +#define MASK_ZICBOM (1 << 1) +#define MASK_ZICBOP (1 << 2) + +#define TARGET_ZICBOZ ((riscv_zicmo_subext & MASK_ZICBOZ) != 0) +#define TARGET_ZICBOM ((riscv_zicmo_subext & MASK_ZICBOM) != 0) +#define TARGET_ZICBOP ((riscv_zicmo_subext & MASK_ZICBOP) != 0) + +/* Bit of riscv_zvl_flags will set contintuly, N-1 bit will set if N-bit is + set, e.g. MASK_ZVL64B has set then MASK_ZVL32B is set, so we can use + popcount to caclulate the minimal VLEN. */ +#define TARGET_MIN_VLEN \ + ((riscv_zvl_flags == 0) \ + ? 0 \ + : 32 << (__builtin_popcount (riscv_zvl_flags) - 1)) + #endif /* ! 
GCC_RISCV_OPTS_H */ diff --git a/gcc/config/riscv/riscv-subset.h b/gcc/config/riscv/riscv-subset.h index 4f3556a..c592650 100644 --- a/gcc/config/riscv/riscv-subset.h +++ b/gcc/config/riscv/riscv-subset.h @@ -68,6 +68,7 @@ private: const char *); void handle_implied_ext (riscv_subset_t *); + void handle_combine_ext (); public: ~riscv_subset_list (); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 7da9d37..2e83ca0 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -220,6 +220,7 @@ struct riscv_tune_param unsigned short issue_rate; unsigned short branch_cost; unsigned short memory_cost; + unsigned short fmv_cost; bool slow_unaligned_access; }; @@ -285,6 +286,7 @@ static const struct riscv_tune_param rocket_tune_info = { 1, /* issue_rate */ 3, /* branch_cost */ 5, /* memory_cost */ + 8, /* fmv_cost */ true, /* slow_unaligned_access */ }; @@ -298,6 +300,7 @@ static const struct riscv_tune_param sifive_7_tune_info = { 2, /* issue_rate */ 4, /* branch_cost */ 3, /* memory_cost */ + 8, /* fmv_cost */ true, /* slow_unaligned_access */ }; @@ -311,6 +314,7 @@ static const struct riscv_tune_param thead_c906_tune_info = { 1, /* issue_rate */ 3, /* branch_cost */ 5, /* memory_cost */ + 8, /* fmv_cost */ false, /* slow_unaligned_access */ }; @@ -324,6 +328,7 @@ static const struct riscv_tune_param optimize_size_tune_info = { 1, /* issue_rate */ 1, /* branch_cost */ 2, /* memory_cost */ + 8, /* fmv_cost */ false, /* slow_unaligned_access */ }; @@ -415,6 +420,15 @@ riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS], /* Simply BSETI. */ codes[0].code = UNKNOWN; codes[0].value = value; + + /* RISC-V sign-extends all 32bit values that live in a 32bit + register. To avoid paradoxes, we thus need to use the + sign-extended (negative) representation (-1 << 31) for the + value, if we want to build (1 << 31) in SImode. This will + then expand to an LUI instruction. */ + if (mode == SImode && value == (HOST_WIDE_INT_1U << 31)) + codes[0].value = (HOST_WIDE_INT_M1U << 31); + return 1; } @@ -1797,6 +1811,12 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN case SYMBOL_REF: case LABEL_REF: case CONST_DOUBLE: + /* With TARGET_SUPPORTS_WIDE_INT const int can't be in CONST_DOUBLE + rtl object. Weird recheck due to switch-case fall through above. */ + if (GET_CODE (x) == CONST_DOUBLE) + gcc_assert (GET_MODE (x) != VOIDmode); + /* Fall through. */ + case CONST: if ((cost = riscv_const_insns (x)) > 0) { @@ -2918,8 +2938,8 @@ riscv_pass_aggregate_in_fpr_pair_p (const_tree type, if ((n_old != n_new) && (warned == 0)) { - warning (0, "ABI for flattened struct with zero-length bit-fields " - "changed in GCC 10"); + warning (OPT_Wpsabi, "ABI for flattened struct with zero-length " + "bit-fields changed in GCC 10"); warned = 1; } @@ -2960,8 +2980,8 @@ riscv_pass_aggregate_in_fpr_and_gpr_p (const_tree type, && (num_int_old != num_int_new || num_float_old != num_float_new))) && (warned == 0)) { - warning (0, "ABI for flattened struct with zero-length bit-fields " - "changed in GCC 10"); + warning (OPT_Wpsabi, "ABI for flattened struct with zero-length " + "bit-fields changed in GCC 10"); warned = 1; } @@ -4737,6 +4757,10 @@ static int riscv_register_move_cost (machine_mode mode, reg_class_t from, reg_class_t to) { + if ((from == FP_REGS && to == GR_REGS) || + (from == GR_REGS && to == FP_REGS)) + return tune_param->fmv_cost; + return riscv_secondary_memory_needed (mode, from, to) ? 
8 : 2; } diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 8a4d2cf..6f7f4d3 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -528,13 +528,10 @@ enum reg_class (((VALUE) | ((1UL<<31) - IMM_REACH)) == ((1UL<<31) - IMM_REACH) \ || ((VALUE) | ((1UL<<31) - IMM_REACH)) + IMM_REACH == 0) -/* If this is a single bit mask, then we can load it with bseti. But this - is not useful for any of the low 31 bits because we can use addi or lui - to load them. It is wrong for loading SImode 0x80000000 on rv64 because it - needs to be sign-extended. So we restrict this to the upper 32-bits - only. */ -#define SINGLE_BIT_MASK_OPERAND(VALUE) \ - (pow2p_hwi (VALUE) && (ctz_hwi (VALUE) >= 32)) +/* If this is a single bit mask, then we can load it with bseti. Special + handling of SImode 0x80000000 on RV64 is done in riscv_build_integer_1. */ +#define SINGLE_BIT_MASK_OPERAND(VALUE) \ + (pow2p_hwi (VALUE)) /* Stack layout; function entry, exit and calling. */ @@ -1004,4 +1001,11 @@ extern void riscv_remove_unneeded_save_restore_calls (void); #define HARD_REGNO_RENAME_OK(FROM, TO) riscv_hard_regno_rename_ok (FROM, TO) +#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \ + ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2) +#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \ + ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2) + +#define TARGET_SUPPORTS_WIDE_INT 1 + #endif /* ! GCC_RISCV_H */ diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index b3c5bce..308b64d 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -42,6 +42,8 @@ UNSPEC_COPYSIGN UNSPEC_LRINT UNSPEC_LROUND + UNSPEC_FMIN + UNSPEC_FMAX ;; Stack tie UNSPEC_TIE @@ -69,6 +71,13 @@ ;; Stack Smash Protector UNSPEC_SSP_SET UNSPEC_SSP_TEST + + ;; CMO instructions. + UNSPECV_CLEAN + UNSPECV_FLUSH + UNSPECV_INVAL + UNSPECV_ZERO + UNSPECV_PREI ]) (define_constants @@ -222,8 +231,6 @@ (eq_attr "got" "load") (const_int 8) - (eq_attr "type" "fcmp") (const_int 8) - ;; SHIFT_SHIFTs are decomposed into two separate instructions. (eq_attr "move_type" "shift_shift") (const_int 8) @@ -1216,6 +1223,26 @@ ;; ;; .................... +(define_insn "fmin<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (unspec:ANYF [(use (match_operand:ANYF 1 "register_operand" " f")) + (use (match_operand:ANYF 2 "register_operand" " f"))] + UNSPEC_FMIN))] + "TARGET_HARD_FLOAT && !HONOR_SNANS (<MODE>mode)" + "fmin.<fmt>\t%0,%1,%2" + [(set_attr "type" "fmove") + (set_attr "mode" "<UNITMODE>")]) + +(define_insn "fmax<mode>3" + [(set (match_operand:ANYF 0 "register_operand" "=f") + (unspec:ANYF [(use (match_operand:ANYF 1 "register_operand" " f")) + (use (match_operand:ANYF 2 "register_operand" " f"))] + UNSPEC_FMAX))] + "TARGET_HARD_FLOAT && !HONOR_SNANS (<MODE>mode)" + "fmax.<fmt>\t%0,%1,%2" + [(set_attr "type" "fmove") + (set_attr "mode" "<UNITMODE>")]) + (define_insn "smin<mode>3" [(set (match_operand:ANYF 0 "register_operand" "=f") (smin:ANYF (match_operand:ANYF 1 "register_operand" " f") @@ -2315,7 +2342,7 @@ QUIET_COMPARISON)) (clobber (match_scratch:X 3 "=&r"))] "TARGET_HARD_FLOAT && ! 
HONOR_SNANS (<ANYF:MODE>mode)" - "frflags\t%3\n\tf<quiet_pattern>.<fmt>\t%0,%1,%2\n\tfsflags %3" + "frflags\t%3\n\tf<quiet_pattern>.<fmt>\t%0,%1,%2\n\tfsflags\t%3" [(set_attr "type" "fcmp") (set_attr "mode" "<UNITMODE>") (set (attr "length") (const_int 12))]) @@ -2328,7 +2355,7 @@ QUIET_COMPARISON)) (clobber (match_scratch:X 3 "=&r"))] "TARGET_HARD_FLOAT && HONOR_SNANS (<ANYF:MODE>mode)" - "frflags\t%3\n\tf<quiet_pattern>.<fmt>\t%0,%1,%2\n\tfsflags %3\n\tfeq.<fmt>\tzero,%1,%2" + "frflags\t%3\n\tf<quiet_pattern>.<fmt>\t%0,%1,%2\n\tfsflags\t%3\n\tfeq.<fmt>\tzero,%1,%2" [(set_attr "type" "fcmp") (set_attr "mode" "<UNITMODE>") (set (attr "length") (const_int 16))]) @@ -2863,6 +2890,56 @@ "<load>\t%3, %1\;<load>\t%0, %2\;xor\t%0, %3, %0\;li\t%3, 0" [(set_attr "length" "12")]) +(define_insn "riscv_clean_<mode>" + [(unspec_volatile:X [(match_operand:X 0 "register_operand" "r")] + UNSPECV_CLEAN)] + "TARGET_ZICBOM" + "cbo.clean\t%a0" +) + +(define_insn "riscv_flush_<mode>" + [(unspec_volatile:X [(match_operand:X 0 "register_operand" "r")] + UNSPECV_FLUSH)] + "TARGET_ZICBOM" + "cbo.flush\t%a0" +) + +(define_insn "riscv_inval_<mode>" + [(unspec_volatile:X [(match_operand:X 0 "register_operand" "r")] + UNSPECV_INVAL)] + "TARGET_ZICBOM" + "cbo.inval\t%a0" +) + +(define_insn "riscv_zero_<mode>" + [(unspec_volatile:X [(match_operand:X 0 "register_operand" "r")] + UNSPECV_ZERO)] + "TARGET_ZICBOZ" + "cbo.zero\t%a0" +) + +(define_insn "prefetch" + [(prefetch (match_operand 0 "address_operand" "p") + (match_operand 1 "imm5_operand" "i") + (match_operand 2 "const_int_operand" "n"))] + "TARGET_ZICBOP" +{ + switch (INTVAL (operands[1])) + { + case 0: return "prefetch.r\t%a0"; + case 1: return "prefetch.w\t%a0"; + default: gcc_unreachable (); + } +}) + +(define_insn "riscv_prefetchi_<mode>" + [(unspec_volatile:X [(match_operand:X 0 "address_operand" "p") + (match_operand:X 1 "imm5_operand" "i")] + UNSPECV_PREI)] + "TARGET_ZICBOP" + "prefetch.i\t%a0" +) + (include "bitmanip.md") (include "sync.md") (include "peephole.md") diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index 9fffc08..9e9fe6d 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -92,7 +92,7 @@ Target RejectNegative Joined Var(riscv_cpu_string) -mcpu=PROCESSOR Use architecture of and optimize the output for PROCESSOR. msmall-data-limit= -Target Joined Separate UInteger Var(g_switch_value) Init(8) +Target Joined UInteger Var(g_switch_value) Init(8) -msmall-data-limit=N Put global and static data smaller than <number> bytes into a special section (on some targets). msave-restore @@ -204,11 +204,14 @@ TargetVariable int riscv_zk_subext TargetVariable -int riscv_vector_eew_flags +int riscv_vector_elen_flags TargetVariable int riscv_zvl_flags +TargetVariable +int riscv_zicmo_subext + Enum Name(isa_spec_class) Type(enum riscv_isa_spec_class) Supported ISA specs (for use with the -misa-spec= option): diff --git a/gcc/config/rs6000/constraints.md b/gcc/config/rs6000/constraints.md index 9eb36fe..5a44a92 100644 --- a/gcc/config/rs6000/constraints.md +++ b/gcc/config/rs6000/constraints.md @@ -29,7 +29,7 @@ "A base register. 
Like @code{r}, but @code{r0} is not allowed, so @code{r1}@dots{}@code{r31}.") -(define_register_constraint "f" "rs6000_constraints[RS6000_CONSTRAINT_f]" +(define_register_constraint "f" "rs6000_constraints[RS6000_CONSTRAINT_d]" "A floating point register (FPR), @code{f0}@dots{}@code{f31}.") (define_register_constraint "d" "rs6000_constraints[RS6000_CONSTRAINT_d]" @@ -37,7 +37,7 @@ historically @code{f} was for single-precision and @code{d} was for double-precision floating point.") -(define_register_constraint "v" "ALTIVEC_REGS" +(define_register_constraint "v" "rs6000_constraints[RS6000_CONSTRAINT_v]" "An Altivec vector register (VR), @code{v0}@dots{}@code{v31}.") (define_register_constraint "wa" "rs6000_constraints[RS6000_CONSTRAINT_wa]" @@ -263,7 +263,7 @@ (match_test "REG_P (XEXP (op, 0))"))) (define_memory_constraint "Y" - "@internal A memory operand for a DQ-form instruction." + "@internal A memory operand for a DS-form instruction." (and (match_code "mem") (match_test "mem_operand_gpr (op, mode)"))) diff --git a/gcc/config/rs6000/driver-rs6000.cc b/gcc/config/rs6000/driver-rs6000.cc index ec890e0..b871f0a 100644 --- a/gcc/config/rs6000/driver-rs6000.cc +++ b/gcc/config/rs6000/driver-rs6000.cc @@ -599,7 +599,7 @@ host_detect_local_cpu (int argc, const char **argv) if (assembler) { - for (i = 0; i < sizeof (asm_names) / sizeof (asm_names[0]); i++) + for (i = 0; i < ARRAY_SIZE (asm_names); i++) { if (!asm_names[i].cpu || !strcmp (asm_names[i].cpu, cpu)) return asm_names[i].asm_sw; diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index 907c9d6..a183b6a 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -490,50 +490,50 @@ [(set_attr "type" "mma")]) (define_insn "mma_<vv>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa")] MMA_VV))] "TARGET_MMA" "<vv> %A0,%x1,%x2" [(set_attr "type" "mma")]) (define_insn "mma_<avv>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 3 "vsx_register_operand" "v,?wa")] MMA_AVV))] "TARGET_MMA" "<avv> %A0,%x2,%x3" [(set_attr "type" "mma")]) (define_insn "mma_<pv>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa")] MMA_PV))] "TARGET_MMA" "<pv> %A0,%x1,%x2" [(set_attr "type" "mma")]) (define_insn "mma_<apv>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:OO 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0") + (match_operand:OO 2 
"vsx_register_operand" "v,?wa") + (match_operand:V16QI 3 "vsx_register_operand" "v,?wa")] MMA_APV))] "TARGET_MMA" "<apv> %A0,%x2,%x3" [(set_attr "type" "mma")]) (define_insn "mma_<vvi4i4i8>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "u8bit_cint_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:SI 3 "const_0_to_15_operand" "n,n") + (match_operand:SI 4 "const_0_to_15_operand" "n,n") + (match_operand:SI 5 "u8bit_cint_operand" "n,n")] MMA_VVI4I4I8))] "TARGET_MMA" "<vvi4i4i8> %A0,%x1,%x2,%3,%4,%5" @@ -541,13 +541,13 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<avvi4i4i8>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n") - (match_operand:SI 6 "u8bit_cint_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 3 "vsx_register_operand" "v,?wa") + (match_operand:SI 4 "const_0_to_15_operand" "n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n") + (match_operand:SI 6 "u8bit_cint_operand" "n,n")] MMA_AVVI4I4I8))] "TARGET_MMA" "<avvi4i4i8> %A0,%x2,%x3,%4,%5,%6" @@ -555,12 +555,12 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<vvi4i4i2>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_3_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:SI 3 "const_0_to_15_operand" "n,n") + (match_operand:SI 4 "const_0_to_15_operand" "n,n") + (match_operand:SI 5 "const_0_to_3_operand" "n,n")] MMA_VVI4I4I2))] "TARGET_MMA" "<vvi4i4i2> %A0,%x1,%x2,%3,%4,%5" @@ -568,13 +568,13 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<avvi4i4i2>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n") - (match_operand:SI 6 "const_0_to_3_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 3 "vsx_register_operand" "v,?wa") + (match_operand:SI 4 "const_0_to_15_operand" "n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n") + (match_operand:SI 6 "const_0_to_3_operand" "n,n")] MMA_AVVI4I4I2))] "TARGET_MMA" "<avvi4i4i2> %A0,%x2,%x3,%4,%5,%6" @@ -582,11 
+582,11 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<vvi4i4>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_15_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:SI 3 "const_0_to_15_operand" "n,n") + (match_operand:SI 4 "const_0_to_15_operand" "n,n")] MMA_VVI4I4))] "TARGET_MMA" "<vvi4i4> %A0,%x1,%x2,%3,%4" @@ -594,12 +594,12 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<avvi4i4>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 3 "vsx_register_operand" "v,?wa") + (match_operand:SI 4 "const_0_to_15_operand" "n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n")] MMA_AVVI4I4))] "TARGET_MMA" "<avvi4i4> %A0,%x2,%x3,%4,%5" @@ -607,11 +607,11 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<pvi4i2>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_3_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:SI 3 "const_0_to_15_operand" "n,n") + (match_operand:SI 4 "const_0_to_3_operand" "n,n")] MMA_PVI4I2))] "TARGET_MMA" "<pvi4i2> %A0,%x1,%x2,%3,%4" @@ -619,12 +619,12 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<apvi4i2>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:OO 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_3_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0") + (match_operand:OO 2 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 3 "vsx_register_operand" "v,?wa") + (match_operand:SI 4 "const_0_to_15_operand" "n,n") + (match_operand:SI 5 "const_0_to_3_operand" "n,n")] MMA_APVI4I2))] "TARGET_MMA" "<apvi4i2> %A0,%x2,%x3,%4,%5" @@ -632,12 +632,12 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<vvi4i4i4>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 2 "vsx_register_operand" 
"v,?wa") + (match_operand:SI 3 "const_0_to_15_operand" "n,n") + (match_operand:SI 4 "const_0_to_15_operand" "n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n")] MMA_VVI4I4I4))] "TARGET_MMA" "<vvi4i4i4> %A0,%x1,%x2,%3,%4,%5" @@ -645,13 +645,13 @@ (set_attr "prefixed" "yes")]) (define_insn "mma_<avvi4i4i4>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n") - (match_operand:SI 6 "const_0_to_15_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,?wa") + (match_operand:V16QI 3 "vsx_register_operand" "v,?wa") + (match_operand:SI 4 "const_0_to_15_operand" "n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n") + (match_operand:SI 6 "const_0_to_15_operand" "n,n")] MMA_AVVI4I4I4))] "TARGET_MMA" "<avvi4i4i4> %A0,%x2,%x3,%4,%5,%6" diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 566b85b..b1fcc69 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1277,10 +1277,15 @@ (define_predicate "mma_disassemble_output_operand" (match_code "reg,subreg,mem") { + if (MEM_P (op)) + { + rtx addr = XEXP (op, 0); + return indexed_or_indirect_address (addr, mode) + || quad_address_p (addr, mode, false); + } + if (SUBREG_P (op)) op = SUBREG_REG (op); - if (!REG_P (op)) - return true; return vsx_register_operand (op, mode); }) diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc index e925ba9..2819773 100644 --- a/gcc/config/rs6000/rs6000-builtin.cc +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -45,8 +45,8 @@ #include "expr.h" #include "langhooks.h" #include "gimplify.h" -#include "gimple-fold.h" #include "gimple-iterator.h" +#include "gimple-fold.h" #include "ssa.h" #include "tree-ssa-propagate.h" #include "builtins.h" @@ -2000,16 +2000,14 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) case RS6000_BIF_VCMPEQUH: case RS6000_BIF_VCMPEQUW: case RS6000_BIF_VCMPEQUD: - /* We deliberately omit RS6000_BIF_VCMPEQUT for now, because gimple - folding produces worse code for 128-bit compares. */ + case RS6000_BIF_VCMPEQUT: fold_compare_helper (gsi, EQ_EXPR, stmt); return true; case RS6000_BIF_VCMPNEB: case RS6000_BIF_VCMPNEH: case RS6000_BIF_VCMPNEW: - /* We deliberately omit RS6000_BIF_VCMPNET for now, because gimple - folding produces worse code for 128-bit compares. */ + case RS6000_BIF_VCMPNET: fold_compare_helper (gsi, NE_EXPR, stmt); return true; @@ -2021,9 +2019,8 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) case RS6000_BIF_CMPGE_U4SI: case RS6000_BIF_CMPGE_2DI: case RS6000_BIF_CMPGE_U2DI: - /* We deliberately omit RS6000_BIF_CMPGE_1TI and RS6000_BIF_CMPGE_U1TI - for now, because gimple folding produces worse code for 128-bit - compares. */ + case RS6000_BIF_CMPGE_1TI: + case RS6000_BIF_CMPGE_U1TI: fold_compare_helper (gsi, GE_EXPR, stmt); return true; @@ -2035,9 +2032,8 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) case RS6000_BIF_VCMPGTUW: case RS6000_BIF_VCMPGTUD: case RS6000_BIF_VCMPGTSD: - /* We deliberately omit RS6000_BIF_VCMPGTUT and RS6000_BIF_VCMPGTST - for now, because gimple folding produces worse code for 128-bit - compares. 
*/ + case RS6000_BIF_VCMPGTUT: + case RS6000_BIF_VCMPGTST: fold_compare_helper (gsi, GT_EXPR, stmt); return true; @@ -2049,9 +2045,8 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) case RS6000_BIF_CMPLE_U4SI: case RS6000_BIF_CMPLE_2DI: case RS6000_BIF_CMPLE_U2DI: - /* We deliberately omit RS6000_BIF_CMPLE_1TI and RS6000_BIF_CMPLE_U1TI - for now, because gimple folding produces worse code for 128-bit - compares. */ + case RS6000_BIF_CMPLE_1TI: + case RS6000_BIF_CMPLE_U1TI: fold_compare_helper (gsi, LE_EXPR, stmt); return true; diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index 221bbc7..f76f547 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -215,7 +215,7 @@ ; processors, this builtin automatically falls back to mffs on older ; platforms. Thus it appears here in the [always] stanza. double __builtin_mffsl (); - MFFSL rs6000_mffsl {} + MFFSL rs6000_mffsl {nosoft} ; This is redundant with __builtin_pack_ibm128, as it requires long ; double to be __ibm128. Should probably be deprecated. @@ -226,10 +226,10 @@ MFTB rs6000_mftb_di {32bit} void __builtin_mtfsb0 (const int<5>); - MTFSB0 rs6000_mtfsb0 {} + MTFSB0 rs6000_mtfsb0 {nosoft} void __builtin_mtfsb1 (const int<5>); - MTFSB1 rs6000_mtfsb1 {} + MTFSB1 rs6000_mtfsb1 {nosoft} void __builtin_mtfsf (const int<8>, double); MTFSF rs6000_mtfsf {} @@ -238,7 +238,7 @@ PACK_IF packif {ibm128} void __builtin_set_fpscr_rn (const int[0,3]); - SET_FPSCR_RN rs6000_set_fpscr_rn {} + SET_FPSCR_RN rs6000_set_fpscr_rn {nosoft} const double __builtin_unpack_ibm128 (__ibm128, const int<1>); UNPACK_IF unpackif {ibm128} @@ -410,6 +410,18 @@ const vss __builtin_altivec_nabs_v8hi (vss); NABS_V8HI nabsv8hi2 {} + const vsc __builtin_altivec_neg_v16qi (vsc); + NEG_V16QI negv16qi2 {} + + const vf __builtin_altivec_neg_v4sf (vf); + NEG_V4SF negv4sf2 {} + + const vsi __builtin_altivec_neg_v4si (vsi); + NEG_V4SI negv4si2 {} + + const vss __builtin_altivec_neg_v8hi (vss); + NEG_V8HI negv8hi2 {} + void __builtin_altivec_stvebx (vsc, signed long, void *); STVEBX altivec_stvebx {stvec} @@ -1175,6 +1187,9 @@ const vsll __builtin_altivec_nabs_v2di (vsll); NABS_V2DI nabsv2di2 {} + const vd __builtin_altivec_neg_v2df (vd); + NEG_V2DF negv2df2 {} + void __builtin_altivec_stvx_v2df (vd, signed long, void *); STVX_V2DF altivec_stvx_v2df {stvec} @@ -1409,9 +1424,10 @@ pure vsc __builtin_vsx_ld_elemrev_v16qi (signed long, const void *); LD_ELEMREV_V16QI vsx_ld_elemrev_v16qi {ldvec,endian} -; TODO: There is apparent intent in rs6000-builtin.def to have -; RS6000_BTC_SPECIAL processing for LXSDX, LXVDSX, and STXSDX, but there are -; no def_builtin calls for any of them. At some point, we may want to add a +; TODO: There was apparent intent in the rs6000-builtin.def to +; have SPECIAL processing for LXSDX, LXVDSX, and STXSDX, but there are +; no def_builtin calls for any of them. That file was removed as part +; of the BIF rewrite, but at some point, we may want to add a ; set of built-ins for whichever vector types make sense for these. 
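A sketch of the user-visible effect of the rs6000_gimple_fold_builtin hunks above, which re-enable GIMPLE folding for the 128-bit vector compares (this assumes -mcpu=power10 and that the overloaded vec_cmpeq accepts vector signed __int128; the function name is illustrative only, not part of the patch):

    #include <altivec.h>

    /* vec_cmpeq on 128-bit elements resolves to the VCMPEQUT built-in,
       which rs6000_gimple_fold_builtin now lowers to a plain GIMPLE
       EQ_EXPR instead of leaving the built-in call until expand time.  */
    vector bool __int128
    eq128 (vector signed __int128 a, vector signed __int128 b)
    {
      return vec_cmpeq (a, b);
    }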
pure vsq __builtin_vsx_lxvd2x_v1ti (signed long, const void *); @@ -2118,24 +2134,9 @@ const vus __builtin_altivec_nand_v8hi_uns (vus, vus); NAND_V8HI_UNS nandv8hi3 {} - const vsc __builtin_altivec_neg_v16qi (vsc); - NEG_V16QI negv16qi2 {} - - const vd __builtin_altivec_neg_v2df (vd); - NEG_V2DF negv2df2 {} - const vsll __builtin_altivec_neg_v2di (vsll); NEG_V2DI negv2di2 {} - const vf __builtin_altivec_neg_v4sf (vf); - NEG_V4SF negv4sf2 {} - - const vsi __builtin_altivec_neg_v4si (vsi); - NEG_V4SI negv4si2 {} - - const vss __builtin_altivec_neg_v8hi (vss); - NEG_V8HI negv8hi2 {} - const vsc __builtin_altivec_orc_v16qi (vsc, vsc); ORC_V16QI orcv16qi3 {} @@ -2969,7 +2970,7 @@ PACK_TD packtd {} void __builtin_set_fpscr_drn (const int[0,7]); - SET_FPSCR_DRN rs6000_set_fpscr_drn {} + SET_FPSCR_DRN rs6000_set_fpscr_drn {nosoft,no32bit} const unsigned long long __builtin_unpack_dec128 (_Decimal128, const int<1>); UNPACK_TD unpacktd {} diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 3b62b49..9c8cbd7 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -178,9 +178,8 @@ rid_int128(void) return RID_MAX + 1; } -/* Called to decide whether a conditional macro should be expanded. - Since we have exactly one such macro (i.e, 'vector'), we do not - need to examine the 'tok' parameter. */ +/* Called to decide whether a conditional macro should be expanded + by peeking two or more tokens(_bool/_pixel/int/long/double/...). */ static cpp_hashnode * rs6000_macro_to_expand (cpp_reader *pfile, const cpp_token *tok) @@ -282,7 +281,9 @@ rs6000_macro_to_expand (cpp_reader *pfile, const cpp_token *tok) expand_bool_pixel = __pixel_keyword; else if (ident == C_CPP_HASHNODE (__bool_keyword)) expand_bool_pixel = __bool_keyword; - else + + /* If there are more tokens to check. */ + else if (ident) { /* Try two tokens down, too. */ do @@ -1686,6 +1687,10 @@ find_instance (bool *unsupported_builtin, ovlddata **instance, ovlddata *inst = *instance; gcc_assert (inst != NULL); + /* It is possible for an instance to require a data type that isn't + defined on this target, in which case inst->fntype will be NULL. 
*/ + if (!inst->fntype) + return error_mark_node; tree fntype = rs6000_builtin_info[inst->bifid].fntype; tree parmtype0 = TREE_VALUE (TYPE_ARG_TYPES (fntype)); tree parmtype1 = TREE_VALUE (TREE_CHAIN (TYPE_ARG_TYPES (fntype))); @@ -1788,8 +1793,9 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, { if (TYPE_READONLY (TREE_TYPE (type)) && !TYPE_READONLY (TREE_TYPE (decl_type))) - warning (0, "passing argument %d of %qE discards const qualifier " - "from pointer target type", n + 1, fndecl); + warning (0, "passing argument %d of %qE discards %qs " + "qualifier from pointer target type", n + 1, fndecl, + "const"); type = build_qualified_type (TREE_TYPE (type), 0); type = build_pointer_type (type); arg = fold_convert (type, arg); diff --git a/gcc/config/rs6000/rs6000-call.cc b/gcc/config/rs6000/rs6000-call.cc index f06c692..551968b 100644 --- a/gcc/config/rs6000/rs6000-call.cc +++ b/gcc/config/rs6000/rs6000-call.cc @@ -55,8 +55,8 @@ #include "common/common-target.h" #include "langhooks.h" #include "gimplify.h" -#include "gimple-fold.h" #include "gimple-iterator.h" +#include "gimple-fold.h" #include "ssa.h" #include "tree-ssa-propagate.h" #include "builtins.h" @@ -1111,6 +1111,12 @@ rs6000_function_arg_advance_1 (CUMULATIVE_ARGS *cum, machine_mode mode, { cum->vregno += n_elts; + /* If we are not splitting Complex IEEE128 args then account for the + fact that they are passed in 2 VSX regs. */ + if (!targetm.calls.split_complex_arg && type + && TREE_CODE (type) == COMPLEX_TYPE && elt_mode == KCmode) + cum->vregno++; + if (!TARGET_ALTIVEC) error ("cannot pass argument in vector register because" " altivec instructions are disabled, use %qs" diff --git a/gcc/config/rs6000/rs6000-p8swap.cc b/gcc/config/rs6000/rs6000-p8swap.cc index d301bc3..275702f 100644 --- a/gcc/config/rs6000/rs6000-p8swap.cc +++ b/gcc/config/rs6000/rs6000-p8swap.cc @@ -214,8 +214,9 @@ union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use) if (DF_REF_INSN_INFO (link->ref)) { rtx def_insn = DF_REF_INSN (link->ref); - (void)unionfind_union (insn_entry + INSN_UID (insn), - insn_entry + INSN_UID (def_insn)); + gcc_assert (NONDEBUG_INSN_P (def_insn)); + unionfind_union (insn_entry + INSN_UID (insn), + insn_entry + INSN_UID (def_insn)); } link = link->next; @@ -242,8 +243,9 @@ union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def) if (DF_REF_INSN_INFO (link->ref)) { rtx use_insn = DF_REF_INSN (link->ref); - (void)unionfind_union (insn_entry + INSN_UID (insn), - insn_entry + INSN_UID (use_insn)); + if (NONDEBUG_INSN_P (use_insn)) + unionfind_union (insn_entry + INSN_UID (insn), + insn_entry + INSN_UID (use_insn)); } link = link->next; diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 283e830..59481d9 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -58,8 +58,8 @@ #include "reload.h" #include "sched-int.h" #include "gimplify.h" -#include "gimple-fold.h" #include "gimple-iterator.h" +#include "gimple-fold.h" #include "gimple-walk.h" #include "ssa.h" #include "tree-vectorizer.h" @@ -2305,7 +2305,6 @@ rs6000_debug_reg_global (void) fprintf (stderr, "\n" "d reg_class = %s\n" - "f reg_class = %s\n" "v reg_class = %s\n" "wa reg_class = %s\n" "we reg_class = %s\n" @@ -2314,7 +2313,6 @@ rs6000_debug_reg_global (void) "wA reg_class = %s\n" "\n", reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_d]], - reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_f]], reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_v]], 
reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_wa]], reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_we]], @@ -2953,7 +2951,6 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p) constraints are: d - Register class to use with traditional DFmode instructions. - f - Register class to use with traditional SFmode instructions. v - Altivec register. wa - Any VSX register. wc - Reserved to represent individual CR bits (used in LLVM). @@ -2962,18 +2959,11 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p) wx - Float register if we can do 32-bit int stores. */ if (TARGET_HARD_FLOAT) - { - rs6000_constraints[RS6000_CONSTRAINT_f] = FLOAT_REGS; /* SFmode */ - rs6000_constraints[RS6000_CONSTRAINT_d] = FLOAT_REGS; /* DFmode */ - } - - if (TARGET_VSX) - rs6000_constraints[RS6000_CONSTRAINT_wa] = VSX_REGS; - - /* Add conditional constraints based on various options, to allow us to - collapse multiple insn patterns. */ + rs6000_constraints[RS6000_CONSTRAINT_d] = FLOAT_REGS; if (TARGET_ALTIVEC) rs6000_constraints[RS6000_CONSTRAINT_v] = ALTIVEC_REGS; + if (TARGET_VSX) + rs6000_constraints[RS6000_CONSTRAINT_wa] = VSX_REGS; if (TARGET_POWERPC64) { @@ -4151,7 +4141,10 @@ rs6000_option_override_internal (bool global_init_p) if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR)) { - if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX) + /* Do not generate lxvp and stxvp on power10 since there are some + performance issues. */ + if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX + && rs6000_tune != PROCESSOR_POWER10) rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR; else rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR; @@ -4345,8 +4338,8 @@ rs6000_option_override_internal (bool global_init_p) rs6000_veclib_handler = rs6000_builtin_vectorized_libmass; else { - error ("unknown vectorization library ABI type (%qs) for " - "%qs switch", rs6000_veclibabi_name, "-mveclibabi="); + error ("unknown vectorization library ABI type in " + "%<-mveclibabi=%s%>", rs6000_veclibabi_name); ret = false; } } @@ -15867,11 +15860,30 @@ rs6000_maybe_emit_maxc_minc (rtx dest, rtx op, rtx true_cond, rtx false_cond) rtx op1 = XEXP (op, 1); machine_mode compare_mode = GET_MODE (op0); machine_mode result_mode = GET_MODE (dest); - bool max_p = false; if (result_mode != compare_mode) return false; + /* See the comments of this function, it simply expects GE/GT/LE/LT in + the checks, but for the reversible equivalent UNLT/UNLE/UNGT/UNGE, + we need to do the reversions first to make the following checks + support fewer cases, like: + + (a UNLT b) ? op1 : op2 => (a >= b) ? op2 : op1; + (a UNLE b) ? op1 : op2 => (a > b) ? op2 : op1; + (a UNGT b) ? op1 : op2 => (a <= b) ? op2 : op1; + (a UNGE b) ? op1 : op2 => (a < b) ? op2 : op1; + + By the way, if we see these UNLT/UNLE/UNGT/UNGE it's guaranteed + that we have 4-way condition codes (LT/GT/EQ/UN), so we do not + have to check for fast-math or the like. */ + if (code == UNGE || code == UNGT || code == UNLE || code == UNLT) + { + code = reverse_condition_maybe_unordered (code); + std::swap (true_cond, false_cond); + } + + bool max_p; if (code == GE || code == GT) max_p = true; else if (code == LE || code == LT) @@ -23285,9 +23297,13 @@ rs6000_expand_vec_perm_const_1 (rtx target, rtx op0, rtx op1, /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. 
*/ static bool -rs6000_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) +rs6000_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) { + if (vmode != op_mode) + return false; + bool testing_p = !target; /* AltiVec (and thus VSX) can handle arbitrary permutations. */ @@ -25331,6 +25347,11 @@ rs6000_can_inline_p (tree caller, tree callee) } } + /* Ignore -mpower8-fusion and -mpower10-fusion options for inlining + purposes. */ + callee_isa &= ~(OPTION_MASK_P8_FUSION | OPTION_MASK_P10_FUSION); + explicit_isa &= ~(OPTION_MASK_P8_FUSION | OPTION_MASK_P10_FUSION); + /* The callee's options must be a subset of the caller's options, i.e. a vsx function may inline an altivec function, but a no-vsx function must not inline a vsx function. However, for those options that the @@ -25659,11 +25680,20 @@ rs6000_sibcall_aix (rtx value, rtx func_desc, rtx tlsarg, rtx cookie) rtx r12 = NULL_RTX; rtx func_addr = func_desc; - gcc_assert (INTVAL (cookie) == 0); - if (global_tlsarg) tlsarg = global_tlsarg; + /* Handle longcall attributes. */ + if (INTVAL (cookie) & CALL_LONG && SYMBOL_REF_P (func_desc)) + { + /* PCREL can do a sibling call to a longcall function + because we don't need to restore the TOC register. */ + gcc_assert (rs6000_pcrel_p ()); + func_desc = rs6000_longcall_ref (func_desc, tlsarg); + } + else + gcc_assert (INTVAL (cookie) == 0); + /* For ELFv2, r12 and CTR need to hold the function address for an indirect call. */ if (GET_CODE (func_desc) != SYMBOL_REF && DEFAULT_ABI == ABI_ELFv2) diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 523256a5..3b8941a 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -1240,8 +1240,7 @@ extern enum reg_class rs6000_regno_regclass[FIRST_PSEUDO_REGISTER]; /* Register classes for various constraints that are based on the target switches. */ enum r6000_reg_class_enum { - RS6000_CONSTRAINT_d, /* fpr registers for double values */ - RS6000_CONSTRAINT_f, /* fpr registers for single values */ + RS6000_CONSTRAINT_d, /* FPR registers */ RS6000_CONSTRAINT_v, /* Altivec registers */ RS6000_CONSTRAINT_wa, /* Any VSX register */ RS6000_CONSTRAINT_we, /* VSX register if ISA 3.0 vector. */ @@ -2249,54 +2248,6 @@ extern char rs6000_reg_names[][8]; /* register names (0 vs. %r0). */ /* General flags. */ extern int frame_pointer_needed; -/* Classification of the builtin functions as to which switches enable the - builtin, and what attributes it should have. We used to use the target - flags macros, but we've run out of bits, so we now map the options into new - settings used here. */ - -/* Builtin operand count. */ -#define RS6000_BTC_UNARY 0x00000001 /* normal unary function. */ -#define RS6000_BTC_BINARY 0x00000002 /* normal binary function. */ -#define RS6000_BTC_TERNARY 0x00000003 /* normal ternary function. */ -#define RS6000_BTC_QUATERNARY 0x00000004 /* normal quaternary - function. */ -#define RS6000_BTC_QUINARY 0x00000005 /* normal quinary function. */ -#define RS6000_BTC_SENARY 0x00000006 /* normal senary function. */ -#define RS6000_BTC_OPND_MASK 0x00000007 /* Mask to isolate operands. */ - -/* Builtin attributes. */ -#define RS6000_BTC_SPECIAL 0x00000000 /* Special function. */ -#define RS6000_BTC_PREDICATE 0x00000008 /* predicate function. */ -#define RS6000_BTC_ABS 0x00000010 /* Altivec/VSX ABS - function. */ -#define RS6000_BTC_DST 0x00000020 /* Altivec DST function. 
*/ - -#define RS6000_BTC_TYPE_MASK 0x0000003f /* Mask to isolate types */ - -#define RS6000_BTC_MISC 0x00000000 /* No special attributes. */ -#define RS6000_BTC_CONST 0x00000100 /* Neither uses, nor - modifies global state. */ -#define RS6000_BTC_PURE 0x00000200 /* reads global - state/mem and does - not modify global state. */ -#define RS6000_BTC_FP 0x00000400 /* depends on rounding mode. */ -#define RS6000_BTC_QUAD 0x00000800 /* Uses a register quad. */ -#define RS6000_BTC_PAIR 0x00001000 /* Uses a register pair. */ -#define RS6000_BTC_QUADPAIR 0x00001800 /* Uses a quad and a pair. */ -#define RS6000_BTC_ATTR_MASK 0x00001f00 /* Mask of the attributes. */ - -/* Miscellaneous information. */ -#define RS6000_BTC_SPR 0x01000000 /* function references SPRs. */ -#define RS6000_BTC_VOID 0x02000000 /* function has no return value. */ -#define RS6000_BTC_CR 0x04000000 /* function references a CR. */ -#define RS6000_BTC_OVERLOADED 0x08000000 /* function is overloaded. */ -#define RS6000_BTC_GIMPLE 0x10000000 /* function should be expanded - into gimple. */ -#define RS6000_BTC_MISC_MASK 0x1f000000 /* Mask of the misc info. */ - -/* Convenience macros to document the instruction type. */ -#define RS6000_BTC_MEM RS6000_BTC_MISC /* load/store touches mem. */ -#define RS6000_BTC_SAT RS6000_BTC_MISC /* saturate sets VSCR. */ /* Builtin targets. For now, we reuse the masks for those options that are in target flags, and pick a random bit for ldbl128, which isn't in diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index fdfbc65..c55ee7e 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -619,18 +619,6 @@ (define_mode_iterator SIGNBIT [(KF "FLOAT128_VECTOR_P (KFmode)") (TF "FLOAT128_VECTOR_P (TFmode)")]) -; Iterator for ISA 3.0 supported floating point types -(define_mode_iterator FP_ISA3 [SF DF]) - -; SF/DF constraint for arithmetic on traditional floating point registers -(define_mode_attr Ff [(SF "f") (DF "d") (DI "d")]) - -; SF/DF constraint for arithmetic on VSX registers using instructions added in -; ISA 2.06 (power7). This includes instructions that normally target DF mode, -; but are used on SFmode, since internally SFmode values are kept in the DFmode -; format. -(define_mode_attr Fv [(SF "wa") (DF "wa") (DI "wa")]) - ; Which isa is needed for those float instructions? (define_mode_attr Fisa [(SF "p8v") (DF "*") (DI "*")]) @@ -835,8 +823,8 @@ ;; complex forms. Basic data transfer is done later. 
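The rs6000.md changes that follow replace the removed <Ff>/<Fv> mode attributes with the literal "d" and "wa" constraints. A minimal sketch of why this is behavior-neutral from user code (instruction names approximate; a VSX-capable -mcpu is assumed):

    /* SFmode values are kept in DFmode format in the FPRs, and the "f"
       constraint now maps to the same register class as "d", so spelling
       the constraints directly as "d"/"wa" changes no generated code.  */
    float  add_sf (float a, float b)   { return a + b; }  /* fadds or xsaddsp */
    double add_df (double a, double b) { return a + b; }  /* fadd  or xsadddp */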
(define_insn "zero_extendqi<mode>2" - [(set (match_operand:EXTQI 0 "gpc_reg_operand" "=r,r,^wa,^v") - (zero_extend:EXTQI (match_operand:QI 1 "reg_or_mem_operand" "m,r,Z,v")))] + [(set (match_operand:EXTQI 0 "gpc_reg_operand" "=r,r,wa,^v") + (zero_extend:EXTQI (match_operand:QI 1 "reg_or_mem_operand" "m,r,?Z,v")))] "" "@ lbz%U1%X1 %0,%1 @@ -889,8 +877,8 @@ (define_insn "zero_extendhi<mode>2" - [(set (match_operand:EXTHI 0 "gpc_reg_operand" "=r,r,^wa,^v") - (zero_extend:EXTHI (match_operand:HI 1 "reg_or_mem_operand" "m,r,Z,v")))] + [(set (match_operand:EXTHI 0 "gpc_reg_operand" "=r,r,wa,^v") + (zero_extend:EXTHI (match_operand:HI 1 "reg_or_mem_operand" "m,r,?Z,v")))] "" "@ lhz%U1%X1 %0,%1 @@ -944,7 +932,7 @@ (define_insn "zero_extendsi<mode>2" [(set (match_operand:EXTSI 0 "gpc_reg_operand" "=r,r,d,wa,wa,r,wa") - (zero_extend:EXTSI (match_operand:SI 1 "reg_or_mem_operand" "m,r,Z,Z,r,wa,wa")))] + (zero_extend:EXTSI (match_operand:SI 1 "reg_or_mem_operand" "m,r,?Z,?Z,r,wa,wa")))] "" "@ lwz%U1%X1 %0,%1 @@ -2353,6 +2341,19 @@ "subfe %0,%0,%0" [(set_attr "type" "add")]) +(define_insn_and_split "*subfsi3_carry_in_xx_64" + [(set (match_operand:DI 0 "gpc_reg_operand" "=r") + (sign_extend:DI (plus:SI (reg:SI CA_REGNO) + (const_int -1))))] + "TARGET_POWERPC64" + "#" + "&&1" + [(parallel [(set (match_dup 0) + (plus:DI (reg:DI CA_REGNO) + (const_int -1))) + (clobber (reg:DI CA_REGNO))])] + "" +) (define_insn "@neg<mode>2" [(set (match_operand:GPR 0 "gpc_reg_operand" "=r") @@ -2837,8 +2838,8 @@ emit_insn (gen_bswapsi2 (dest_32, word2)); } - emit_insn (gen_ashldi3 (op3, op3, GEN_INT (32))); - emit_insn (gen_iordi3 (dest, dest, op3)); + emit_insn (gen_rotldi3_insert_3 (dest, op3, GEN_INT (32), dest, + GEN_INT (0xffffffff))); DONE; }) @@ -2923,10 +2924,10 @@ rtx op3_si = simplify_gen_subreg (SImode, op3, DImode, lo_off); emit_insn (gen_lshrdi3 (op2, src, GEN_INT (32))); - emit_insn (gen_bswapsi2 (dest_si, src_si)); - emit_insn (gen_bswapsi2 (op3_si, op2_si)); - emit_insn (gen_ashldi3 (dest, dest, GEN_INT (32))); - emit_insn (gen_iordi3 (dest, dest, op3)); + emit_insn (gen_bswapsi2 (op3_si, src_si)); + emit_insn (gen_bswapsi2 (dest_si, op2_si)); + emit_insn (gen_rotldi3_insert_3 (dest, op3, GEN_INT (32), dest, + GEN_INT (0xffffffff))); DONE; }) @@ -4871,8 +4872,8 @@ "") (define_insn "*abs<mode>2_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>") - (abs:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")))] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (abs:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT" "@ fabs %0,%1 @@ -4880,10 +4881,10 @@ [(set_attr "type" "fpsimple")]) (define_insn "*nabs<mode>2_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") (neg:SFDF (abs:SFDF - (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>"))))] + (match_operand:SFDF 1 "gpc_reg_operand" "d,wa"))))] "TARGET_HARD_FLOAT" "@ fnabs %0,%1 @@ -4897,8 +4898,8 @@ "") (define_insn "*neg<mode>2_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>") - (neg:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")))] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (neg:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT" "@ fneg %0,%1 @@ -4913,9 +4914,9 @@ "") (define_insn "*add<mode>3_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa") - (plus:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,wa") - (match_operand:SFDF 2 "gpc_reg_operand" 
"<Ff>,wa")))] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (plus:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%d,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT" "@ fadd<s> %0,%1,%2 @@ -4931,9 +4932,9 @@ "") (define_insn "*sub<mode>3_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa") - (minus:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa") - (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,wa")))] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (minus:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "d,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT" "@ fsub<s> %0,%1,%2 @@ -4949,9 +4950,9 @@ "") (define_insn "*mul<mode>3_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa") - (mult:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,wa") - (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,wa")))] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (mult:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "%d,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT" "@ fmul<s> %0,%1,%2 @@ -4975,9 +4976,9 @@ }) (define_insn "*div<mode>3_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa") - (div:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa") - (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,wa")))] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (div:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "d,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT" "@ fdiv<s> %0,%1,%2 @@ -4986,8 +4987,8 @@ (set_attr "isa" "*,<Fisa>")]) (define_insn "*sqrt<mode>2_internal" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa") - (sqrt:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa")))] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (sqrt:SFDF (match_operand:SFDF 1 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT && TARGET_PPC_GPOPT" "@ fsqrt<s> %0,%1 @@ -5014,8 +5015,8 @@ ;; Floating point reciprocal approximation (define_insn "fre<sd>" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa") - (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa")] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "d,wa")] UNSPEC_FRES))] "TARGET_<FFRE>" "@ @@ -5061,8 +5062,8 @@ }) (define_insn "*rsqrt<mode>2" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa") - (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa")] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "d,wa")] UNSPEC_RSQRT))] "RS6000_RECIP_HAVE_RSQRTE_P (<MODE>mode)" "@ @@ -5074,8 +5075,8 @@ ;; Floating point comparisons (define_insn "*cmp<mode>_fpr" [(set (match_operand:CCFP 0 "cc_reg_operand" "=y,y") - (compare:CCFP (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa") - (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,wa")))] + (compare:CCFP (match_operand:SFDF 1 "gpc_reg_operand" "d,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT" "@ fcmpu %0,%1,%2 @@ -5277,9 +5278,9 @@ ;; Use an unspec rather providing an if-then-else in RTL, to prevent the ;; compiler from optimizing -0.0 (define_insn "copysign<mode>3_fcpsgn" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>") - (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>") - (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,<Fv>")] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (unspec:SFDF 
[(match_operand:SFDF 1 "gpc_reg_operand" "d,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa")] UNSPEC_COPYSIGN))] "TARGET_HARD_FLOAT && (TARGET_CMPB || VECTOR_UNIT_VSX_P (<MODE>mode))" "@ @@ -5311,9 +5312,9 @@ }) (define_insn "*s<minmax><mode>3_vsx" - [(set (match_operand:SFDF 0 "vsx_register_operand" "=<Fv>") - (fp_minmax:SFDF (match_operand:SFDF 1 "vsx_register_operand" "<Fv>") - (match_operand:SFDF 2 "vsx_register_operand" "<Fv>")))] + [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa") + (fp_minmax:SFDF (match_operand:SFDF 1 "vsx_register_operand" "wa") + (match_operand:SFDF 2 "vsx_register_operand" "wa")))] "TARGET_VSX && TARGET_HARD_FLOAT" { return (TARGET_P9_MINMAX @@ -5468,13 +5469,13 @@ [(set_attr "type" "fp")]) (define_insn_and_split "*mov<SFDF:mode><SFDF2:mode>cc_p9" - [(set (match_operand:SFDF 0 "vsx_register_operand" "=&<SFDF:Fv>,<SFDF:Fv>") + [(set (match_operand:SFDF 0 "vsx_register_operand" "=&wa,wa") (if_then_else:SFDF (match_operator:CCFP 1 "fpmask_comparison_operator" - [(match_operand:SFDF2 2 "vsx_register_operand" "<SFDF2:Fv>,<SFDF2:Fv>") - (match_operand:SFDF2 3 "vsx_register_operand" "<SFDF2:Fv>,<SFDF2:Fv>")]) - (match_operand:SFDF 4 "vsx_register_operand" "<SFDF:Fv>,<SFDF:Fv>") - (match_operand:SFDF 5 "vsx_register_operand" "<SFDF:Fv>,<SFDF:Fv>"))) + [(match_operand:SFDF2 2 "vsx_register_operand" "wa,wa") + (match_operand:SFDF2 3 "vsx_register_operand" "wa,wa")]) + (match_operand:SFDF 4 "vsx_register_operand" "wa,wa") + (match_operand:SFDF 5 "vsx_register_operand" "wa,wa"))) (clobber (match_scratch:V2DI 6 "=0,&wa"))] "TARGET_P9_MINMAX" "#" @@ -5500,13 +5501,13 @@ ;; Handle inverting the fpmask comparisons. (define_insn_and_split "*mov<SFDF:mode><SFDF2:mode>cc_invert_p9" - [(set (match_operand:SFDF 0 "vsx_register_operand" "=&<SFDF:Fv>,<SFDF:Fv>") + [(set (match_operand:SFDF 0 "vsx_register_operand" "=&wa,wa") (if_then_else:SFDF (match_operator:CCFP 1 "invert_fpmask_comparison_operator" - [(match_operand:SFDF2 2 "vsx_register_operand" "<SFDF2:Fv>,<SFDF2:Fv>") - (match_operand:SFDF2 3 "vsx_register_operand" "<SFDF2:Fv>,<SFDF2:Fv>")]) - (match_operand:SFDF 4 "vsx_register_operand" "<SFDF:Fv>,<SFDF:Fv>") - (match_operand:SFDF 5 "vsx_register_operand" "<SFDF:Fv>,<SFDF:Fv>"))) + [(match_operand:SFDF2 2 "vsx_register_operand" "wa,wa") + (match_operand:SFDF2 3 "vsx_register_operand" "wa,wa")]) + (match_operand:SFDF 4 "vsx_register_operand" "wa,wa") + (match_operand:SFDF 5 "vsx_register_operand" "wa,wa"))) (clobber (match_scratch:V2DI 6 "=0,&wa"))] "TARGET_P9_MINMAX" "#" @@ -5539,8 +5540,8 @@ [(set (match_operand:V2DI 0 "vsx_register_operand" "=wa") (if_then_else:V2DI (match_operator:CCFP 1 "fpmask_comparison_operator" - [(match_operand:SFDF 2 "vsx_register_operand" "<Fv>") - (match_operand:SFDF 3 "vsx_register_operand" "<Fv>")]) + [(match_operand:SFDF 2 "vsx_register_operand" "wa") + (match_operand:SFDF 3 "vsx_register_operand" "wa")]) (match_operand:V2DI 4 "all_ones_constant" "") (match_operand:V2DI 5 "zero_constant" "")))] "TARGET_P9_MINMAX" @@ -5548,11 +5549,11 @@ [(set_attr "type" "fpcompare")]) (define_insn "*xxsel<mode>" - [(set (match_operand:SFDF 0 "vsx_register_operand" "=<Fv>") + [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa") (if_then_else:SFDF (ne (match_operand:V2DI 1 "vsx_register_operand" "wa") (match_operand:V2DI 2 "zero_constant" "")) - (match_operand:SFDF 3 "vsx_register_operand" "<Fv>") - (match_operand:SFDF 4 "vsx_register_operand" "<Fv>")))] + (match_operand:SFDF 3 "vsx_register_operand" "wa") + (match_operand:SFDF 4 
"vsx_register_operand" "wa")))] "TARGET_P9_MINMAX" "xxsel %x0,%x4,%x3,%x1" [(set_attr "type" "vecmove")]) @@ -5687,7 +5688,7 @@ ; not be needed and also in case the insns are deleted as dead code. (define_insn_and_split "floatsi<mode>2_lfiwax" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,<Fv>") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") (float:SFDF (match_operand:SI 1 "nonimmediate_operand" "r,r"))) (clobber (match_scratch:DI 2 "=d,wa"))] "TARGET_HARD_FLOAT && TARGET_LFIWAX @@ -5726,7 +5727,7 @@ (set_attr "type" "fpload")]) (define_insn_and_split "floatsi<mode>2_lfiwax_mem" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,<Fv>") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") (float:SFDF (sign_extend:DI (match_operand:SI 1 "indexed_or_indirect_operand" "Z,Z")))) @@ -5750,7 +5751,7 @@ (set_attr "type" "fpload")]) (define_insn_and_split "floatsi<SFDF:mode>2_lfiwax_<QHI:mode>_mem_zext" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,<Fv>") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") (float:SFDF (zero_extend:SI (match_operand:QHI 1 "indexed_or_indirect_operand" "Z,Z")))) @@ -5784,7 +5785,7 @@ (set_attr "isa" "*,p8v,p8v,p9v")]) (define_insn_and_split "floatunssi<mode>2_lfiwzx" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,<Fv>") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") (unsigned_float:SFDF (match_operand:SI 1 "nonimmediate_operand" "r,r"))) (clobber (match_scratch:DI 2 "=d,wa"))] "TARGET_HARD_FLOAT && TARGET_LFIWZX && <SI_CONVERT_FP>" @@ -5822,7 +5823,7 @@ (set_attr "type" "fpload")]) (define_insn_and_split "floatunssi<mode>2_lfiwzx_mem" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,<Fv>") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") (unsigned_float:SFDF (zero_extend:DI (match_operand:SI 1 "indexed_or_indirect_operand" "Z,Z")))) @@ -6008,9 +6009,9 @@ ;; the vector registers, rather then loading up a GPR, doing a sign/zero ;; extension and then a direct move. 
-(define_expand "float<QHI:mode><FP_ISA3:mode>2" - [(parallel [(set (match_operand:FP_ISA3 0 "vsx_register_operand") - (float:FP_ISA3 +(define_expand "float<QHI:mode><SFDF:mode>2" + [(parallel [(set (match_operand:SFDF 0 "vsx_register_operand") + (float:SFDF (match_operand:QHI 1 "input_operand"))) (clobber (match_scratch:DI 2)) (clobber (match_scratch:DI 3)) @@ -6021,9 +6022,9 @@ operands[1] = rs6000_force_indexed_or_indirect_mem (operands[1]); }) -(define_insn_and_split "*float<QHI:mode><FP_ISA3:mode>2_internal" - [(set (match_operand:FP_ISA3 0 "vsx_register_operand" "=<Fv>,<Fv>,<Fv>") - (float:FP_ISA3 +(define_insn_and_split "*float<QHI:mode><SFDF:mode>2_internal" + [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa,wa,wa") + (float:SFDF (match_operand:QHI 1 "reg_or_indexed_operand" "v,r,Z"))) (clobber (match_scratch:DI 2 "=v,wa,v")) (clobber (match_scratch:DI 3 "=X,r,X")) @@ -6057,14 +6058,14 @@ emit_insn (gen_extend<QHI:mode>di2 (di, tmp)); } - emit_insn (gen_floatdi<FP_ISA3:mode>2 (result, di)); + emit_insn (gen_floatdi<SFDF:mode>2 (result, di)); DONE; } [(set_attr "isa" "p9v,*,p9v")]) -(define_expand "floatuns<QHI:mode><FP_ISA3:mode>2" - [(parallel [(set (match_operand:FP_ISA3 0 "vsx_register_operand") - (unsigned_float:FP_ISA3 +(define_expand "floatuns<QHI:mode><SFDF:mode>2" + [(parallel [(set (match_operand:SFDF 0 "vsx_register_operand") + (unsigned_float:SFDF (match_operand:QHI 1 "input_operand"))) (clobber (match_scratch:DI 2)) (clobber (match_scratch:DI 3))])] @@ -6074,9 +6075,9 @@ operands[1] = rs6000_force_indexed_or_indirect_mem (operands[1]); }) -(define_insn_and_split "*floatuns<QHI:mode><FP_ISA3:mode>2_internal" - [(set (match_operand:FP_ISA3 0 "vsx_register_operand" "=<Fv>,<Fv>,<Fv>") - (unsigned_float:FP_ISA3 +(define_insn_and_split "*floatuns<QHI:mode><SFDF:mode>2_internal" + [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa,wa,wa") + (unsigned_float:SFDF (match_operand:QHI 1 "reg_or_indexed_operand" "v,r,Z"))) (clobber (match_scratch:DI 2 "=v,wa,wa")) (clobber (match_scratch:DI 3 "=X,r,X"))] @@ -6103,7 +6104,7 @@ } } - emit_insn (gen_floatdi<FP_ISA3:mode>2 (result, di)); + emit_insn (gen_floatdi<SFDF:mode>2 (result, di)); DONE; } [(set_attr "isa" "p9v,*,p9v")]) @@ -6205,7 +6206,7 @@ (define_insn "*fix_trunc<mode>di2_fctidz" [(set (match_operand:DI 0 "gpc_reg_operand" "=d,wa") - (fix:DI (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")))] + (fix:DI (match_operand:SFDF 1 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT && TARGET_FCFID" "@ fctidz %0,%1 @@ -6324,7 +6325,7 @@ (define_insn "fixuns_trunc<mode>di2" [(set (match_operand:DI 0 "gpc_reg_operand" "=d,wa") - (unsigned_fix:DI (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")))] + (unsigned_fix:DI (match_operand:SFDF 1 "gpc_reg_operand" "d,wa")))] "TARGET_HARD_FLOAT && TARGET_FCTIDUZ" "@ fctiduz %0,%1 @@ -6474,7 +6475,7 @@ (define_insn "fctiwz_<mode>" [(set (match_operand:DI 0 "gpc_reg_operand" "=d,wa") (unspec:DI [(fix:SI - (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>"))] + (match_operand:SFDF 1 "gpc_reg_operand" "d,wa"))] UNSPEC_FCTIWZ))] "TARGET_HARD_FLOAT" "@ @@ -6485,7 +6486,7 @@ (define_insn "fctiwuz_<mode>" [(set (match_operand:DI 0 "gpc_reg_operand" "=d,wa") (unspec:DI [(unsigned_fix:SI - (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>"))] + (match_operand:SFDF 1 "gpc_reg_operand" "d,wa"))] UNSPEC_FCTIWUZ))] "TARGET_HARD_FLOAT && TARGET_FCTIWUZ" "@ @@ -6588,8 +6589,8 @@ [(set_attr "type" "fp")]) (define_insn "btrunc<mode>2" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>") 
- (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "d,wa")] UNSPEC_FRIZ))] "TARGET_HARD_FLOAT && TARGET_FPRND" "@ @@ -6598,8 +6599,8 @@ [(set_attr "type" "fp")]) (define_insn "ceil<mode>2" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>") - (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "d,wa")] UNSPEC_FRIP))] "TARGET_HARD_FLOAT && TARGET_FPRND" "@ @@ -6608,8 +6609,8 @@ [(set_attr "type" "fp")]) (define_insn "floor<mode>2" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,<Fv>") - (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,<Fv>")] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa") + (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "d,wa")] UNSPEC_FRIM))] "TARGET_HARD_FLOAT && TARGET_FPRND" "@ @@ -6627,8 +6628,8 @@ [(set_attr "type" "fp")]) (define_insn "*xsrdpi<mode>2" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Fv>") - (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "<Fv>")] + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=wa") + (unspec:SFDF [(match_operand:SFDF 1 "gpc_reg_operand" "wa")] UNSPEC_XSRDPI))] "TARGET_HARD_FLOAT && TARGET_VSX" "xsrdpi %x0,%x1" @@ -7496,7 +7497,7 @@ [(set (match_operand:SI 0 "nonimmediate_operand" "=r, r, r, d, v, - m, Z, Z, + m, ?Z, ?Z, r, r, r, r, wa, wa, wa, v, wa, v, v, @@ -7504,7 +7505,7 @@ r, *h, *h") (match_operand:SI 1 "input_operand" "r, U, - m, Z, Z, + m, ?Z, ?Z, r, d, v, I, L, eI, n, wa, O, wM, wB, @@ -7785,11 +7786,11 @@ ;; MTVSRWZ MF%1 MT%1 NOP (define_insn "*mov<mode>_internal" [(set (match_operand:QHI 0 "nonimmediate_operand" - "=r, r, wa, m, Z, r, + "=r, r, wa, m, ?Z, r, wa, wa, wa, v, ?v, r, wa, r, *c*l, *h") (match_operand:QHI 1 "input_operand" - "r, m, Z, r, wa, i, + "r, m, ?Z, r, wa, i, wa, O, wM, wB, wS, wa, r, *h, r, 0"))] "gpc_reg_operand (operands[0], <MODE>mode) @@ -7973,10 +7974,10 @@ ;; FMR MR MT%0 MF%1 NOP (define_insn "movsd_hardfloat" [(set (match_operand:SD 0 "nonimmediate_operand" - "=!r, d, m, Z, ?d, ?r, + "=!r, d, m, ?Z, ?d, ?r, f, !r, *c*l, !r, *h") (match_operand:SD 1 "input_operand" - "m, Z, r, wx, r, d, + "m, ?Z, r, wx, r, d, f, r, r, *h, 0"))] "(register_operand (operands[0], SDmode) || register_operand (operands[1], SDmode)) @@ -10152,7 +10153,7 @@ (set_attr "indexed" "yes,no")]) (define_insn "*mov<SFDF:mode>_update1" - [(set (match_operand:SFDF 3 "gpc_reg_operand" "=<SFDF:Ff>,<SFDF:Ff>") + [(set (match_operand:SFDF 3 "gpc_reg_operand" "=d,d") (mem:SFDF (plus:P (match_operand:P 1 "gpc_reg_operand" "0,0") (match_operand:P 2 "reg_or_short_operand" "r,I")))) (set (match_operand:P 0 "gpc_reg_operand" "=b,b") @@ -10171,7 +10172,7 @@ (define_insn "*mov<SFDF:mode>_update2" [(set (mem:SFDF (plus:P (match_operand:P 1 "gpc_reg_operand" "0,0") (match_operand:P 2 "reg_or_short_operand" "r,I"))) - (match_operand:SFDF 3 "gpc_reg_operand" "<SFDF:Ff>,<SFDF:Ff>")) + (match_operand:SFDF 3 "gpc_reg_operand" "d,d")) (set (match_operand:P 0 "gpc_reg_operand" "=b,b") (plus:P (match_dup 1) (match_dup 2)))] "TARGET_HARD_FLOAT && TARGET_UPDATE @@ -14142,11 +14143,11 @@ "") (define_insn "*fma<mode>4_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa,wa") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa,wa") (fma:SFDF - (match_operand:SFDF 1 "gpc_reg_operand" "%<Ff>,wa,wa") - (match_operand:SFDF 2 
"gpc_reg_operand" "<Ff>,wa,0") - (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,wa")))] + (match_operand:SFDF 1 "gpc_reg_operand" "%d,wa,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa,0") + (match_operand:SFDF 3 "gpc_reg_operand" "d,0,wa")))] "TARGET_HARD_FLOAT" "@ fmadd<s> %0,%1,%2,%3 @@ -14166,11 +14167,11 @@ "") (define_insn "*fms<mode>4_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa,wa") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa,wa") (fma:SFDF - (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa,wa") - (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,wa,0") - (neg:SFDF (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,wa"))))] + (match_operand:SFDF 1 "gpc_reg_operand" "d,wa,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa,0") + (neg:SFDF (match_operand:SFDF 3 "gpc_reg_operand" "d,0,wa"))))] "TARGET_HARD_FLOAT" "@ fmsub<s> %0,%1,%2,%3 @@ -14213,12 +14214,12 @@ "") (define_insn "*nfma<mode>4_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa,wa") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa,wa") (neg:SFDF (fma:SFDF - (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa,wa") - (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,wa,0") - (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,wa"))))] + (match_operand:SFDF 1 "gpc_reg_operand" "d,wa,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa,0") + (match_operand:SFDF 3 "gpc_reg_operand" "d,0,wa"))))] "TARGET_HARD_FLOAT" "@ fnmadd<s> %0,%1,%2,%3 @@ -14239,13 +14240,13 @@ "") (define_insn "*nfmssf4_fpr" - [(set (match_operand:SFDF 0 "gpc_reg_operand" "=<Ff>,wa,wa") + [(set (match_operand:SFDF 0 "gpc_reg_operand" "=d,wa,wa") (neg:SFDF (fma:SFDF - (match_operand:SFDF 1 "gpc_reg_operand" "<Ff>,wa,wa") - (match_operand:SFDF 2 "gpc_reg_operand" "<Ff>,wa,0") + (match_operand:SFDF 1 "gpc_reg_operand" "d,wa,wa") + (match_operand:SFDF 2 "gpc_reg_operand" "d,wa,0") (neg:SFDF - (match_operand:SFDF 3 "gpc_reg_operand" "<Ff>,0,wa")))))] + (match_operand:SFDF 3 "gpc_reg_operand" "d,0,wa")))))] "TARGET_HARD_FLOAT" "@ fnmsub<s> %0,%1,%2,%3 @@ -14580,10 +14581,10 @@ [(set_attr "type" "fp,fpstore,mtvsr,mfvsr,store")]) (define_insn_and_split "unpack<mode>_nodm" - [(set (match_operand:<FP128_64> 0 "nonimmediate_operand" "=d,m") + [(set (match_operand:<FP128_64> 0 "nonimmediate_operand" "=d,m,m") (unspec:<FP128_64> - [(match_operand:FMOVE128 1 "register_operand" "d,d") - (match_operand:QI 2 "const_0_to_1_operand" "i,i")] + [(match_operand:FMOVE128 1 "register_operand" "d,d,r") + (match_operand:QI 2 "const_0_to_1_operand" "i,i,i")] UNSPEC_UNPACK_128BIT))] "(!TARGET_POWERPC64 || !TARGET_DIRECT_MOVE) && FLOAT128_2REG_P (<MODE>mode)" "#" @@ -14600,15 +14601,28 @@ operands[3] = gen_rtx_REG (<FP128_64>mode, fp_regno); } - [(set_attr "type" "fp,fpstore")]) + [(set_attr "type" "fp,fpstore,store")]) + +(define_expand "pack<mode>" + [(use (match_operand:FMOVE128 0 "register_operand")) + (use (match_operand:<FP128_64> 1 "register_operand")) + (use (match_operand:<FP128_64> 2 "register_operand"))] + "FLOAT128_2REG_P (<MODE>mode)" +{ + if (TARGET_HARD_FLOAT) + emit_insn (gen_pack<mode>_hard (operands[0], operands[1], operands[2])); + else + emit_insn (gen_pack<mode>_soft (operands[0], operands[1], operands[2])); + DONE; +}) -(define_insn_and_split "pack<mode>" +(define_insn_and_split "pack<mode>_hard" [(set (match_operand:FMOVE128 0 "register_operand" "=&d") (unspec:FMOVE128 [(match_operand:<FP128_64> 1 "register_operand" "d") (match_operand:<FP128_64> 2 "register_operand" "d")] UNSPEC_PACK_128BIT))] - "FLOAT128_2REG_P 
(<MODE>mode)" + "FLOAT128_2REG_P (<MODE>mode) && TARGET_HARD_FLOAT" "#" "&& reload_completed" [(set (match_dup 3) (match_dup 1)) @@ -14626,6 +14640,34 @@ [(set_attr "type" "fp") (set_attr "length" "8")]) +(define_insn_and_split "pack<mode>_soft" + [(set (match_operand:FMOVE128 0 "register_operand" "=&r") + (unspec:FMOVE128 + [(match_operand:<FP128_64> 1 "register_operand" "r") + (match_operand:<FP128_64> 2 "register_operand" "r")] + UNSPEC_PACK_128BIT))] + "FLOAT128_2REG_P (<MODE>mode) && TARGET_SOFT_FLOAT" + "#" + "&& reload_completed" + [(set (match_dup 3) (match_dup 1)) + (set (match_dup 4) (match_dup 2))] +{ + unsigned dest_hi = REGNO (operands[0]); + unsigned dest_lo = dest_hi + (TARGET_POWERPC64 ? 1 : 2); + + gcc_assert (!IN_RANGE (REGNO (operands[1]), dest_hi, dest_lo)); + gcc_assert (!IN_RANGE (REGNO (operands[2]), dest_hi, dest_lo)); + + operands[3] = gen_rtx_REG (<FP128_64>mode, dest_hi); + operands[4] = gen_rtx_REG (<FP128_64>mode, dest_lo); +} + [(set_attr "type" "integer") + (set (attr "length") + (if_then_else + (match_test "TARGET_POWERPC64") + (const_string "8") + (const_string "16")))]) + (define_insn "unpack<mode>" [(set (match_operand:DI 0 "register_operand" "=wa,wa") (unspec:DI [(match_operand:FMOVE128_VSX 1 "register_operand" "0,wa") diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 4d0797c..a0d33d2 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -26,6 +26,9 @@ ;; Vector int modes (define_mode_iterator VEC_I [V16QI V8HI V4SI V2DI]) +;; Vector int modes for comparison, shift and rotation +(define_mode_iterator VEC_IC [V16QI V8HI V4SI V2DI (V1TI "TARGET_POWER10")]) + ;; 128-bit int modes (define_mode_iterator VEC_TI [V1TI TI]) @@ -533,10 +536,10 @@ ;; For signed integer vectors comparison. (define_expand "vec_cmp<mode><mode>" - [(set (match_operand:VEC_I 0 "vint_operand") + [(set (match_operand:VEC_IC 0 "vint_operand") (match_operator 1 "signed_or_equality_comparison_operator" - [(match_operand:VEC_I 2 "vint_operand") - (match_operand:VEC_I 3 "vint_operand")]))] + [(match_operand:VEC_IC 2 "vint_operand") + (match_operand:VEC_IC 3 "vint_operand")]))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" { enum rtx_code code = GET_CODE (operands[1]); @@ -573,10 +576,10 @@ ;; For unsigned integer vectors comparison. 
(define_expand "vec_cmpu<mode><mode>" - [(set (match_operand:VEC_I 0 "vint_operand") + [(set (match_operand:VEC_IC 0 "vint_operand") (match_operator 1 "unsigned_or_equality_comparison_operator" - [(match_operand:VEC_I 2 "vint_operand") - (match_operand:VEC_I 3 "vint_operand")]))] + [(match_operand:VEC_IC 2 "vint_operand") + (match_operand:VEC_IC 3 "vint_operand")]))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" { enum rtx_code code = GET_CODE (operands[1]); @@ -690,116 +693,65 @@ ; >= for integer vectors: swap operands and apply not-greater-than (define_expand "vector_nlt<mode>" - [(set (match_operand:VEC_I 3 "vlogical_operand") - (gt:VEC_I (match_operand:VEC_I 2 "vlogical_operand") - (match_operand:VEC_I 1 "vlogical_operand"))) - (set (match_operand:VEC_I 0 "vlogical_operand") - (not:VEC_I (match_dup 3)))] + [(set (match_operand:VEC_IC 3 "vlogical_operand") + (gt:VEC_IC (match_operand:VEC_IC 2 "vlogical_operand") + (match_operand:VEC_IC 1 "vlogical_operand"))) + (set (match_operand:VEC_IC 0 "vlogical_operand") + (not:VEC_IC (match_dup 3)))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" { operands[3] = gen_reg_rtx_and_attrs (operands[0]); }) -(define_expand "vector_nltv1ti" - [(set (match_operand:V1TI 3 "vlogical_operand") - (gt:V1TI (match_operand:V1TI 2 "vlogical_operand") - (match_operand:V1TI 1 "vlogical_operand"))) - (set (match_operand:V1TI 0 "vlogical_operand") - (not:V1TI (match_dup 3)))] - "TARGET_POWER10" -{ - operands[3] = gen_reg_rtx_and_attrs (operands[0]); -}) - (define_expand "vector_gtu<mode>" - [(set (match_operand:VEC_I 0 "vint_operand") - (gtu:VEC_I (match_operand:VEC_I 1 "vint_operand") - (match_operand:VEC_I 2 "vint_operand")))] + [(set (match_operand:VEC_IC 0 "vint_operand") + (gtu:VEC_IC (match_operand:VEC_IC 1 "vint_operand") + (match_operand:VEC_IC 2 "vint_operand")))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" "") -(define_expand "vector_gtuv1ti" - [(set (match_operand:V1TI 0 "altivec_register_operand") - (gtu:V1TI (match_operand:V1TI 1 "altivec_register_operand") - (match_operand:V1TI 2 "altivec_register_operand")))] - "TARGET_POWER10" - "") - ; >= for integer vectors: swap operands and apply not-greater-than (define_expand "vector_nltu<mode>" - [(set (match_operand:VEC_I 3 "vlogical_operand") - (gtu:VEC_I (match_operand:VEC_I 2 "vlogical_operand") - (match_operand:VEC_I 1 "vlogical_operand"))) - (set (match_operand:VEC_I 0 "vlogical_operand") - (not:VEC_I (match_dup 3)))] + [(set (match_operand:VEC_IC 3 "vlogical_operand") + (gtu:VEC_IC (match_operand:VEC_IC 2 "vlogical_operand") + (match_operand:VEC_IC 1 "vlogical_operand"))) + (set (match_operand:VEC_IC 0 "vlogical_operand") + (not:VEC_IC (match_dup 3)))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" { operands[3] = gen_reg_rtx_and_attrs (operands[0]); }) -(define_expand "vector_nltuv1ti" - [(set (match_operand:V1TI 3 "vlogical_operand") - (gtu:V1TI (match_operand:V1TI 2 "vlogical_operand") - (match_operand:V1TI 1 "vlogical_operand"))) - (set (match_operand:V1TI 0 "vlogical_operand") - (not:V1TI (match_dup 3)))] - "TARGET_POWER10" -{ - operands[3] = gen_reg_rtx_and_attrs (operands[0]); -}) - (define_expand "vector_geu<mode>" - [(set (match_operand:VEC_I 0 "vint_operand") - (geu:VEC_I (match_operand:VEC_I 1 "vint_operand") - (match_operand:VEC_I 2 "vint_operand")))] + [(set (match_operand:VEC_IC 0 "vint_operand") + (geu:VEC_IC (match_operand:VEC_IC 1 "vint_operand") + (match_operand:VEC_IC 2 "vint_operand")))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" "") ; <= for integer vectors: apply not-greater-than 
(define_expand "vector_ngt<mode>" - [(set (match_operand:VEC_I 3 "vlogical_operand") - (gt:VEC_I (match_operand:VEC_I 1 "vlogical_operand") - (match_operand:VEC_I 2 "vlogical_operand"))) - (set (match_operand:VEC_I 0 "vlogical_operand") - (not:VEC_I (match_dup 3)))] + [(set (match_operand:VEC_IC 3 "vlogical_operand") + (gt:VEC_IC (match_operand:VEC_IC 1 "vlogical_operand") + (match_operand:VEC_IC 2 "vlogical_operand"))) + (set (match_operand:VEC_IC 0 "vlogical_operand") + (not:VEC_IC (match_dup 3)))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" { operands[3] = gen_reg_rtx_and_attrs (operands[0]); }) -(define_expand "vector_ngtv1ti" - [(set (match_operand:V1TI 3 "vlogical_operand") - (gt:V1TI (match_operand:V1TI 1 "vlogical_operand") - (match_operand:V1TI 2 "vlogical_operand"))) - (set (match_operand:V1TI 0 "vlogical_operand") - (not:V1TI (match_dup 3)))] - "TARGET_POWER10" -{ - operands[3] = gen_reg_rtx_and_attrs (operands[0]); -}) - (define_expand "vector_ngtu<mode>" - [(set (match_operand:VEC_I 3 "vlogical_operand") - (gtu:VEC_I (match_operand:VEC_I 1 "vlogical_operand") - (match_operand:VEC_I 2 "vlogical_operand"))) - (set (match_operand:VEC_I 0 "vlogical_operand") - (not:VEC_I (match_dup 3)))] + [(set (match_operand:VEC_IC 3 "vlogical_operand") + (gtu:VEC_IC (match_operand:VEC_IC 1 "vlogical_operand") + (match_operand:VEC_IC 2 "vlogical_operand"))) + (set (match_operand:VEC_IC 0 "vlogical_operand") + (not:VEC_IC (match_dup 3)))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" { operands[3] = gen_reg_rtx_and_attrs (operands[0]); }) -(define_expand "vector_ngtuv1ti" - [(set (match_operand:V1TI 3 "vlogical_operand") - (gtu:V1TI (match_operand:V1TI 1 "vlogical_operand") - (match_operand:V1TI 2 "vlogical_operand"))) - (set (match_operand:V1TI 0 "vlogical_operand") - (not:V1TI (match_dup 3)))] - "TARGET_POWER10" -{ - operands[3] = gen_reg_rtx_and_attrs (operands[0]); -}) - ; There are 14 possible vector FP comparison operators, gt and eq of them have ; been expanded above, so just support 12 remaining operators here. @@ -1189,27 +1141,15 @@ (define_expand "vector_gtu_<mode>_p" [(parallel [(set (reg:CC CR6_REGNO) - (unspec:CC [(gtu:CC (match_operand:VEC_I 1 "vint_operand") - (match_operand:VEC_I 2 "vint_operand"))] + (unspec:CC [(gtu:CC (match_operand:VEC_IC 1 "vint_operand") + (match_operand:VEC_IC 2 "vint_operand"))] UNSPEC_PREDICATE)) - (set (match_operand:VEC_I 0 "vlogical_operand") - (gtu:VEC_I (match_dup 1) - (match_dup 2)))])] + (set (match_operand:VEC_IC 0 "vlogical_operand") + (gtu:VEC_IC (match_dup 1) + (match_dup 2)))])] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" "") -(define_expand "vector_gtu_v1ti_p" - [(parallel - [(set (reg:CC CR6_REGNO) - (unspec:CC [(gtu:CC (match_operand:V1TI 1 "altivec_register_operand") - (match_operand:V1TI 2 "altivec_register_operand"))] - UNSPEC_PREDICATE)) - (set (match_operand:V1TI 0 "altivec_register_operand") - (gtu:V1TI (match_dup 1) - (match_dup 2)))])] - "TARGET_POWER10" - "") - ;; AltiVec/VSX predicates. 
;; This expansion is triggered during expansion of predicate built-in @@ -1582,25 +1522,21 @@ ;; Expanders for rotate each element in a vector (define_expand "vrotl<mode>3" - [(set (match_operand:VEC_I 0 "vint_operand") - (rotate:VEC_I (match_operand:VEC_I 1 "vint_operand") - (match_operand:VEC_I 2 "vint_operand")))] + [(set (match_operand:VEC_IC 0 "vint_operand") + (rotate:VEC_IC (match_operand:VEC_IC 1 "vint_operand") + (match_operand:VEC_IC 2 "vint_operand")))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" - "") - -(define_expand "vrotlv1ti3" - [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") - (rotate:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") - (match_operand:V1TI 2 "vsx_register_operand" "v")))] - "TARGET_POWER10" { - /* Shift amount in needs to be put in bits[57:63] of 128-bit operand2. */ - rtx tmp = gen_reg_rtx (V1TImode); + /* Shift amount in needs to be put in bits[57:63] of 128-bit operand2. */ + if (<MODE>mode == V1TImode) + { + rtx tmp = gen_reg_rtx (V1TImode); - emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); - emit_insn (gen_altivec_vrlq (operands[0], operands[1], tmp)); - DONE; -}) + emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); + emit_insn (gen_altivec_vrlq (operands[0], operands[1], tmp)); + DONE; + } + }) ;; Expanders for rotatert to make use of vrotl (define_expand "vrotr<mode>3" @@ -1663,25 +1599,20 @@ ;; Expanders for arithmetic shift right on each vector element (define_expand "vashr<mode>3" - [(set (match_operand:VEC_I 0 "vint_operand") - (ashiftrt:VEC_I (match_operand:VEC_I 1 "vint_operand") - (match_operand:VEC_I 2 "vint_operand")))] + [(set (match_operand:VEC_IC 0 "vint_operand") + (ashiftrt:VEC_IC (match_operand:VEC_IC 1 "vint_operand") + (match_operand:VEC_IC 2 "vint_operand")))] "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" - "") - -;; No immediate version of this 128-bit instruction -(define_expand "vashrv1ti3" - [(set (match_operand:V1TI 0 "vsx_register_operand" "=v") - (ashiftrt:V1TI (match_operand:V1TI 1 "vsx_register_operand" "v") - (match_operand:V1TI 2 "vsx_register_operand" "v")))] - "TARGET_POWER10" { - /* Shift amount in needs to be put into bits[57:63] of 128-bit operand2. */ - rtx tmp = gen_reg_rtx (V1TImode); + /* Shift amount in needs to be put in bits[57:63] of 128-bit operand2. 
*/ + if (<MODE>mode == V1TImode) + { + rtx tmp = gen_reg_rtx (V1TImode); - emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); - emit_insn (gen_altivec_vsraq (operands[0], operands[1], tmp)); - DONE; + emit_insn (gen_xxswapd_v1ti (tmp, operands[2])); + emit_insn (gen_altivec_vsraq (operands[0], operands[1], tmp)); + DONE; + } }) diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 15bd86d..e226a93 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -186,15 +186,6 @@ (V4SF "vecfdiv") (DF "ddiv")]) -;; Map the scalar mode for a vector type -(define_mode_attr VS_scalar [(V1TI "TI") - (V2DF "DF") - (V2DI "DI") - (V4SF "SF") - (V4SI "SI") - (V8HI "HI") - (V16QI "QI")]) - ;; Map to a double-sized vector mode (define_mode_attr VS_double [(V4SI "V8SI") (V4SF "V8SF") @@ -2996,8 +2987,8 @@ (define_insn "vsx_concat_<mode>" [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa,we") (vec_concat:VSX_D - (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa,b") - (match_operand:<VS_scalar> 2 "gpc_reg_operand" "wa,b")))] + (match_operand:<VEC_base> 1 "gpc_reg_operand" "wa,b") + (match_operand:<VEC_base> 2 "gpc_reg_operand" "wa,b")))] "VECTOR_MEM_VSX_P (<MODE>mode)" { if (which_alternative == 0) @@ -3020,10 +3011,10 @@ (define_insn "*vsx_concat_<mode>_1" [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") (vec_concat:VSX_D - (vec_select:<VS_scalar> + (vec_select:<VEC_base> (match_operand:VSX_D 1 "gpc_reg_operand" "wa") (parallel [(match_operand:QI 2 "const_0_to_1_operand" "n")])) - (match_operand:<VS_scalar> 3 "gpc_reg_operand" "wa")))] + (match_operand:<VEC_base> 3 "gpc_reg_operand" "wa")))] "VECTOR_MEM_VSX_P (<MODE>mode)" { HOST_WIDE_INT dword = INTVAL (operands[2]); @@ -3043,8 +3034,8 @@ (define_insn "*vsx_concat_<mode>_2" [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") (vec_concat:VSX_D - (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa") - (vec_select:<VS_scalar> + (match_operand:<VEC_base> 1 "gpc_reg_operand" "wa") + (vec_select:<VEC_base> (match_operand:VSX_D 2 "gpc_reg_operand" "wa") (parallel [(match_operand:QI 3 "const_0_to_1_operand" "n")]))))] "VECTOR_MEM_VSX_P (<MODE>mode)" @@ -3066,10 +3057,10 @@ (define_insn "*vsx_concat_<mode>_3" [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") (vec_concat:VSX_D - (vec_select:<VS_scalar> + (vec_select:<VEC_base> (match_operand:VSX_D 1 "gpc_reg_operand" "wa") (parallel [(match_operand:QI 2 "const_0_to_1_operand" "n")])) - (vec_select:<VS_scalar> + (vec_select:<VEC_base> (match_operand:VSX_D 3 "gpc_reg_operand" "wa") (parallel [(match_operand:QI 4 "const_0_to_1_operand" "n")]))))] "VECTOR_MEM_VSX_P (<MODE>mode)" @@ -3367,7 +3358,7 @@ (define_expand "vsx_set_<mode>" [(use (match_operand:VSX_D 0 "vsx_register_operand")) (use (match_operand:VSX_D 1 "vsx_register_operand")) - (use (match_operand:<VS_scalar> 2 "gpc_reg_operand")) + (use (match_operand:<VEC_base> 2 "gpc_reg_operand")) (use (match_operand:QI 3 "const_0_to_1_operand"))] "VECTOR_MEM_VSX_P (<MODE>mode)" { @@ -3375,7 +3366,7 @@ rtx vec_reg = operands[1]; rtx value = operands[2]; rtx ele = operands[3]; - rtx tmp = gen_reg_rtx (<VS_scalar>mode); + rtx tmp = gen_reg_rtx (<VEC_base>mode); if (ele == const0_rtx) { @@ -3397,15 +3388,12 @@ ;; Optimize cases were we can do a simple or direct move. ;; Or see if we can avoid doing the move at all -;; There are some unresolved problems with reload that show up if an Altivec -;; register was picked. Limit the scalar value to FPRs for now. 
- (define_insn "vsx_extract_<mode>" - [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=d, d, wr, wr") - (vec_select:<VS_scalar> - (match_operand:VSX_D 1 "gpc_reg_operand" "wa, wa, wa, wa") + [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=wa, wa, wr, wr") + (vec_select:<VEC_base> + (match_operand:VSX_D 1 "gpc_reg_operand" "wa, wa, wa, wa") (parallel - [(match_operand:QI 2 "const_0_to_1_operand" "wD, n, wD, n")])))] + [(match_operand:QI 2 "const_0_to_1_operand" "wD, n, wD, n")])))] "VECTOR_MEM_VSX_P (<MODE>mode)" { int element = INTVAL (operands[2]); @@ -3456,8 +3444,8 @@ ;; Optimize extracting a single scalar element from memory. (define_insn_and_split "*vsx_extract_<P:mode>_<VSX_D:mode>_load" - [(set (match_operand:<VS_scalar> 0 "register_operand" "=wa,wr") - (vec_select:<VSX_D:VS_scalar> + [(set (match_operand:<VEC_base> 0 "register_operand" "=wa,wr") + (vec_select:<VSX_D:VEC_base> (match_operand:VSX_D 1 "memory_operand" "m,m") (parallel [(match_operand:QI 2 "const_0_to_1_operand" "n,n")]))) (clobber (match_scratch:P 3 "=&b,&b"))] @@ -3467,7 +3455,7 @@ [(set (match_dup 0) (match_dup 4))] { operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2], - operands[3], <VSX_D:VS_scalar>mode); + operands[3], <VSX_D:VEC_base>mode); } [(set_attr "type" "fpload,load") (set_attr "length" "8")]) @@ -3475,8 +3463,8 @@ ;; Optimize storing a single scalar element that is the right location to ;; memory (define_insn "*vsx_extract_<mode>_store" - [(set (match_operand:<VS_scalar> 0 "memory_operand" "=m,Z,wY") - (vec_select:<VS_scalar> + [(set (match_operand:<VEC_base> 0 "memory_operand" "=m,Z,wY") + (vec_select:<VEC_base> (match_operand:VSX_D 1 "register_operand" "d,v,v") (parallel [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD")])))] "VECTOR_MEM_VSX_P (<MODE>mode)" @@ -3489,8 +3477,8 @@ ;; Variable V2DI/V2DF extract shift (define_insn "vsx_vslo_<mode>" - [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=v") - (unspec:<VS_scalar> [(match_operand:VSX_D 1 "gpc_reg_operand" "v") + [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=v") + (unspec:<VEC_base> [(match_operand:VSX_D 1 "gpc_reg_operand" "v") (match_operand:V2DI 2 "gpc_reg_operand" "v")] UNSPEC_VSX_VSLO))] "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT" @@ -3499,8 +3487,8 @@ ;; Variable V2DI/V2DF extract from a register (define_insn_and_split "vsx_extract_<mode>_var" - [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=v") - (unspec:<VS_scalar> [(match_operand:VSX_D 1 "gpc_reg_operand" "v") + [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=v") + (unspec:<VEC_base> [(match_operand:VSX_D 1 "gpc_reg_operand" "v") (match_operand:DI 2 "gpc_reg_operand" "r")] UNSPEC_VSX_EXTRACT)) (clobber (match_scratch:DI 3 "=r")) @@ -3517,8 +3505,8 @@ ;; Variable V2DI/V2DF extract from memory (define_insn_and_split "*vsx_extract_<mode>_var_load" - [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=wa,r") - (unspec:<VS_scalar> [(match_operand:VSX_D 1 "memory_operand" "Q,Q") + [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=wa,r") + (unspec:<VEC_base> [(match_operand:VSX_D 1 "memory_operand" "Q,Q") (match_operand:DI 2 "gpc_reg_operand" "r,r")] UNSPEC_VSX_EXTRACT)) (clobber (match_scratch:DI 3 "=&b,&b"))] @@ -3528,7 +3516,7 @@ [(set (match_dup 0) (match_dup 4))] { operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2], - operands[3], <VS_scalar>mode); + operands[3], <VEC_base>mode); } [(set_attr "type" "fpload,load")]) @@ -3737,8 +3725,8 @@ ;; none of the small types were 
allowed in a vector register, so we had to ;; extract to a DImode and either do a direct move or store. (define_expand "vsx_extract_<mode>" - [(parallel [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand") - (vec_select:<VS_scalar> + [(parallel [(set (match_operand:<VEC_base> 0 "gpc_reg_operand") + (vec_select:<VEC_base> (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand") (parallel [(match_operand:QI 2 "const_int_operand")]))) (clobber (match_scratch:VSX_EXTRACT_I 3))])] @@ -3754,8 +3742,8 @@ }) (define_insn "vsx_extract_<mode>_p9" - [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=r,<VSX_EX>") - (vec_select:<VS_scalar> + [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,<VSX_EX>") + (vec_select:<VEC_base> (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>") (parallel [(match_operand:QI 2 "<VSX_EXTRACT_PREDICATE>" "n,n")]))) (clobber (match_scratch:SI 3 "=r,X"))] @@ -3785,8 +3773,8 @@ (set_attr "isa" "p9v,*")]) (define_split - [(set (match_operand:<VS_scalar> 0 "int_reg_operand") - (vec_select:<VS_scalar> + [(set (match_operand:<VEC_base> 0 "int_reg_operand") + (vec_select:<VEC_base> (match_operand:VSX_EXTRACT_I 1 "altivec_register_operand") (parallel [(match_operand:QI 2 "const_int_operand")]))) (clobber (match_operand:SI 3 "int_reg_operand"))] @@ -3811,7 +3799,7 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9" [(set (match_operand:DI 0 "gpc_reg_operand" "=r,<VSX_EX>") (zero_extend:DI - (vec_select:<VS_scalar> + (vec_select:<VEC_base> (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>") (parallel [(match_operand:QI 2 "const_int_operand" "n,n")])))) (clobber (match_scratch:SI 3 "=r,X"))] @@ -3819,28 +3807,28 @@ "#" "&& reload_completed" [(parallel [(set (match_dup 4) - (vec_select:<VS_scalar> + (vec_select:<VEC_base> (match_dup 1) (parallel [(match_dup 2)]))) (clobber (match_dup 3))])] { - operands[4] = gen_rtx_REG (<VS_scalar>mode, REGNO (operands[0])); + operands[4] = gen_rtx_REG (<VEC_base>mode, REGNO (operands[0])); } [(set_attr "isa" "p9v,*")]) ;; Optimize stores to use the ISA 3.0 scalar store instructions (define_insn_and_split "*vsx_extract_<mode>_store_p9" - [(set (match_operand:<VS_scalar> 0 "memory_operand" "=Z,m") - (vec_select:<VS_scalar> + [(set (match_operand:<VEC_base> 0 "memory_operand" "=Z,m") + (vec_select:<VEC_base> (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "<VSX_EX>,v") (parallel [(match_operand:QI 2 "const_int_operand" "n,n")]))) - (clobber (match_scratch:<VS_scalar> 3 "=<VSX_EX>,&*r")) + (clobber (match_scratch:<VEC_base> 3 "=<VSX_EX>,&*r")) (clobber (match_scratch:SI 4 "=X,&r"))] "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB" "#" "&& reload_completed" [(parallel [(set (match_dup 3) - (vec_select:<VS_scalar> + (vec_select:<VEC_base> (match_dup 1) (parallel [(match_dup 2)]))) (clobber (match_dup 4))]) @@ -3901,8 +3889,8 @@ (set_attr "isa" "*,p8v,*")]) (define_insn_and_split "*vsx_extract_<mode>_p8" - [(set (match_operand:<VS_scalar> 0 "nonimmediate_operand" "=r") - (vec_select:<VS_scalar> + [(set (match_operand:<VEC_base> 0 "nonimmediate_operand" "=r") + (vec_select:<VEC_base> (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand" "v") (parallel [(match_operand:QI 2 "<VSX_EXTRACT_PREDICATE>" "n")]))) (clobber (match_scratch:VSX_EXTRACT_I2 3 "=v"))] @@ -3949,8 +3937,8 @@ ;; Optimize extracting a single scalar element from memory. 
(define_insn_and_split "*vsx_extract_<mode>_load" - [(set (match_operand:<VS_scalar> 0 "register_operand" "=r") - (vec_select:<VS_scalar> + [(set (match_operand:<VEC_base> 0 "register_operand" "=r") + (vec_select:<VEC_base> (match_operand:VSX_EXTRACT_I 1 "memory_operand" "m") (parallel [(match_operand:QI 2 "<VSX_EXTRACT_PREDICATE>" "n")]))) (clobber (match_scratch:DI 3 "=&b"))] @@ -3960,15 +3948,15 @@ [(set (match_dup 0) (match_dup 4))] { operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2], - operands[3], <VS_scalar>mode); + operands[3], <VEC_base>mode); } [(set_attr "type" "load") (set_attr "length" "8")]) ;; Variable V16QI/V8HI/V4SI extract from a register (define_insn_and_split "vsx_extract_<mode>_var" - [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=r,r") - (unspec:<VS_scalar> + [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,r") + (unspec:<VEC_base> [(match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,v") (match_operand:DI 2 "gpc_reg_operand" "r,r")] UNSPEC_VSX_EXTRACT)) @@ -3987,8 +3975,8 @@ ;; Variable V16QI/V8HI/V4SI extract from memory (define_insn_and_split "*vsx_extract_<mode>_var_load" - [(set (match_operand:<VS_scalar> 0 "gpc_reg_operand" "=r") - (unspec:<VS_scalar> + [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r") + (unspec:<VEC_base> [(match_operand:VSX_EXTRACT_I 1 "memory_operand" "Q") (match_operand:DI 2 "gpc_reg_operand" "r")] UNSPEC_VSX_EXTRACT)) @@ -3999,7 +3987,7 @@ [(set (match_dup 0) (match_dup 4))] { operands[4] = rs6000_adjust_vec_address (operands[0], operands[1], operands[2], - operands[3], <VS_scalar>mode); + operands[3], <VEC_base>mode); } [(set_attr "type" "load")]) @@ -4175,7 +4163,7 @@ (define_expand "vreplace_elt_<mode>" [(set (match_operand:REPLACE_ELT 0 "register_operand") (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand") - (match_operand:<VS_scalar> 2 "register_operand") + (match_operand:<VEC_base> 2 "register_operand") (match_operand:QI 3 "const_0_to_3_operand")] UNSPEC_REPLACE_ELT))] "TARGET_POWER10" @@ -4199,7 +4187,7 @@ (define_insn "vreplace_elt_<mode>_inst" [(set (match_operand:REPLACE_ELT 0 "register_operand" "=v") (unspec:REPLACE_ELT [(match_operand:REPLACE_ELT 1 "register_operand" "0") - (match_operand:<VS_scalar> 2 "register_operand" "r") + (match_operand:<VEC_base> 2 "register_operand" "r") (match_operand:QI 3 "const_0_to_12_operand" "n")] UNSPEC_REPLACE_ELT))] "TARGET_POWER10" @@ -4209,7 +4197,7 @@ (define_insn "vreplace_un_<mode>" [(set (match_operand:V16QI 0 "register_operand" "=v") (unspec:V16QI [(match_operand:REPLACE_ELT 1 "register_operand" "0") - (match_operand:<VS_scalar> 2 "register_operand" "r") + (match_operand:<VEC_base> 2 "register_operand" "r") (match_operand:QI 3 "const_0_to_12_operand" "n")] UNSPEC_REPLACE_UN))] "TARGET_POWER10" @@ -4325,19 +4313,19 @@ ;; Where <ftype> is SFmode, DFmode (and KFmode/TFmode if those types are IEEE ;; 128-bit hardware types) and <vtype> is vector char, vector unsigned char, ;; vector short or vector unsigned short. 
-(define_insn_and_split "*vsx_ext_<VSX_EXTRACT_I:VS_scalar>_fl_<FL_CONV:mode>" +(define_insn_and_split "*vsx_ext_<VSX_EXTRACT_I:VEC_base>_fl_<FL_CONV:mode>" [(set (match_operand:FL_CONV 0 "gpc_reg_operand" "=wa") (float:FL_CONV - (vec_select:<VSX_EXTRACT_I:VS_scalar> + (vec_select:<VSX_EXTRACT_I:VEC_base> (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v") (parallel [(match_operand:QI 2 "const_int_operand" "n")])))) - (clobber (match_scratch:<VSX_EXTRACT_I:VS_scalar> 3 "=v"))] + (clobber (match_scratch:<VSX_EXTRACT_I:VEC_base> 3 "=v"))] "VECTOR_MEM_VSX_P (<VSX_EXTRACT_I:MODE>mode) && TARGET_DIRECT_MOVE_64BIT && TARGET_P9_VECTOR" "#" "&& reload_completed" [(parallel [(set (match_dup 3) - (vec_select:<VSX_EXTRACT_I:VS_scalar> + (vec_select:<VSX_EXTRACT_I:VEC_base> (match_dup 1) (parallel [(match_dup 2)]))) (clobber (scratch:SI))]) @@ -4350,19 +4338,19 @@ } [(set_attr "isa" "<FL_CONV:VSisa>")]) -(define_insn_and_split "*vsx_ext_<VSX_EXTRACT_I:VS_scalar>_ufl_<FL_CONV:mode>" +(define_insn_and_split "*vsx_ext_<VSX_EXTRACT_I:VEC_base>_ufl_<FL_CONV:mode>" [(set (match_operand:FL_CONV 0 "gpc_reg_operand" "=wa") (unsigned_float:FL_CONV - (vec_select:<VSX_EXTRACT_I:VS_scalar> + (vec_select:<VSX_EXTRACT_I:VEC_base> (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v") (parallel [(match_operand:QI 2 "const_int_operand" "n")])))) - (clobber (match_scratch:<VSX_EXTRACT_I:VS_scalar> 3 "=v"))] + (clobber (match_scratch:<VSX_EXTRACT_I:VEC_base> 3 "=v"))] "VECTOR_MEM_VSX_P (<VSX_EXTRACT_I:MODE>mode) && TARGET_DIRECT_MOVE_64BIT && TARGET_P9_VECTOR" "#" "&& reload_completed" [(parallel [(set (match_dup 3) - (vec_select:<VSX_EXTRACT_I:VS_scalar> + (vec_select:<VSX_EXTRACT_I:VEC_base> (match_dup 1) (parallel [(match_dup 2)]))) (clobber (scratch:SI))]) @@ -4378,7 +4366,7 @@ [(set (match_operand:VSX_EXTRACT_I 0 "gpc_reg_operand" "=<VSX_EX>") (unspec:VSX_EXTRACT_I [(match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "0") - (match_operand:<VS_scalar> 2 "gpc_reg_operand" "<VSX_EX>") + (match_operand:<VEC_base> 2 "gpc_reg_operand" "<VSX_EX>") (match_operand:QI 3 "<VSX_EXTRACT_PREDICATE>" "n")] UNSPEC_VSX_SET))] "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_P9_VECTOR && TARGET_POWERPC64" @@ -4389,7 +4377,7 @@ if (!BYTES_BIG_ENDIAN) ele = nunits - 1 - ele; - operands[3] = GEN_INT (GET_MODE_SIZE (<VS_scalar>mode) * ele); + operands[3] = GEN_INT (GET_MODE_SIZE (<VEC_base>mode) * ele); if (<MODE>mode == V4SImode) return "xxinsertw %x0,%x2,%3"; else @@ -4562,20 +4550,20 @@ (define_expand "vsx_splat_<mode>" [(set (match_operand:VSX_D 0 "vsx_register_operand") (vec_duplicate:VSX_D - (match_operand:<VS_scalar> 1 "input_operand")))] + (match_operand:<VEC_base> 1 "input_operand")))] "VECTOR_MEM_VSX_P (<MODE>mode)" { rtx op1 = operands[1]; if (MEM_P (op1)) operands[1] = rs6000_force_indexed_or_indirect_mem (op1); else if (!REG_P (op1)) - op1 = force_reg (<VSX_D:VS_scalar>mode, op1); + op1 = force_reg (<VSX_D:VEC_base>mode, op1); }) (define_insn "vsx_splat_<mode>_reg" [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa,we") (vec_duplicate:VSX_D - (match_operand:<VS_scalar> 1 "gpc_reg_operand" "wa,b")))] + (match_operand:<VEC_base> 1 "gpc_reg_operand" "wa,b")))] "VECTOR_MEM_VSX_P (<MODE>mode)" "@ xxpermdi %x0,%x1,%x1,0 @@ -4585,7 +4573,7 @@ (define_insn "vsx_splat_<mode>_mem" [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa") (vec_duplicate:VSX_D - (match_operand:<VSX_D:VS_scalar> 1 "memory_operand" "Z")))] + (match_operand:<VSX_D:VEC_base> 1 "memory_operand" "Z")))] "VECTOR_MEM_VSX_P (<MODE>mode)" "lxvdsx %x0,%y1" 
[(set_attr "type" "vecload")]) @@ -4641,7 +4629,7 @@ (define_insn "vsx_xxspltw_<mode>" [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wa") (vec_duplicate:VSX_W - (vec_select:<VS_scalar> + (vec_select:<VEC_base> (match_operand:VSX_W 1 "vsx_register_operand" "wa") (parallel [(match_operand:QI 2 "u5bit_cint_operand" "n")]))))] @@ -4667,7 +4655,7 @@ (define_insn "vsx_vsplt<VSX_SPLAT_SUFFIX>_di" [(set (match_operand:VSX_SPLAT_I 0 "altivec_register_operand" "=v") (vec_duplicate:VSX_SPLAT_I - (truncate:<VS_scalar> + (truncate:<VEC_base> (match_operand:DI 1 "altivec_register_operand" "v"))))] "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT" "vsplt<VSX_SPLAT_SUFFIX> %0,%1,<VSX_SPLAT_COUNT>" diff --git a/gcc/config/s390/3931.md b/gcc/config/s390/3931.md new file mode 100644 index 0000000..bc97bc5 --- /dev/null +++ b/gcc/config/s390/3931.md @@ -0,0 +1,2562 @@ +;; Scheduling description for z16. +;; Copyright (C) 2022 Free Software Foundation, Inc. +;; Contributed by Robin Dapp (rdapp@linux.ibm.com) + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it under +;; the terms of the GNU General Public License as published by the Free +;; Software Foundation; either version 3, or (at your option) any later +;; version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY +;; WARRANTY; without even the implied warranty of MERCHANTABILITY or +;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +(define_attr "z16_unit_fpd" "" +(cond [(eq_attr "mnemonic" +"ddb, +ddbr, +deb, +debr, +dxbr, +sqdb, +sqdbr, +sqeb, +sqebr, +sqxbr, +vfddb, +vfdsb, +vfsqdb, +vfsqsb, +wfddb, +wfdsb, +wfdxb, +wfsqdb, +wfsqxb" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_unit_fxa" "" +(cond [(eq_attr "mnemonic" +"a, +afi, +ag, +agf, +agfi, +agfr, +agh, +aghi, +aghik, +agr, +agrk, +ah, +ahi, +ahik, +ahy, +al, +alc, +alcg, +alcgr, +alcr, +alfi, +alg, +algf, +algfi, +algfr, +alghsik, +algr, +algrk, +alhsik, +alr, +alrk, +aly, +ar, +ark, +ay, +bras, +brasl, +etnd, +exrl, +flogr, +ic, +icm, +icmh, +icmy, +icy, +iihf, +iilf, +ipm, +la, +larl, +lay, +lb, +lbr, +lcgr, +lcr, +lgb, +lgbr, +lgf, +lgfi, +lgfr, +lgfrl, +lgh, +lghi, +lghr, +lghrl, +lgr, +lh, +lhi, +lhr, +lhrl, +lhy, +llcr, +llgcr, +llgfr, +llghr, +llgtr, +llhr, +llihf, +llihh, +llihl, +llilf, +llilh, +llill, +lngr, +lnr, +loc, +locg, +locghi, +locgr, +lochi, +locr, +lpgr, +lpr, +lr, +lrv, +lrvg, +lrvgr, +lrvh, +lrvr, +lt, +ltg, +ltgf, +ltgfr, +ltgr, +ltr, +m, +mfy, +mg, +mgh, +mghi, +mgrk, +mh, +mhi, +mhy, +ml, +mlg, +mlgr, +mlr, +mr, +ms, +msc, +msfi, +msg, +msgc, +msgf, +msgfi, +msgfr, +msgr, +msgrkc, +msr, +msrkc, +msy, +n, +ncgrk, +ncrk, +ng, +ngr, +ngrk, +nihf, +nihh, +nihl, +nilf, +nilh, +nill, +nngrk, +nnrk, +nogrk, +nork, +nr, +nrk, +nxgrk, +nxrk, +ny, +o, +ocgrk, +ocrk, +og, +ogr, +ogrk, +oihf, +oihh, +oihl, +oilf, +oilh, +oill, +or, +ork, +oy, +pfpo, +popcnt, +risbg, +risbgn, +rll, +rllg, +s, +selgr, +selr, +sg, +sgf, +sgfr, +sgh, +sgr, +sgrk, +sh, +shy, +sl, +slb, +slbg, +slbgr, +slbr, +slfi, +slg, +slgf, +slgfi, +slgfr, +slgr, +slgrk, +sll, +sllg, +sllk, +slr, +slrk, +sly, +sr, +sra, +srag, +srak, +srk, +srl, +srlg, +srlk, +sy, +x, +xg, +xgr, +xgrk, +xihf, +xilf, +xr, +xrk, +xy" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_unit_fxb" "" +(cond [(eq_attr 
"mnemonic" +"agsi, +algsi, +alsi, +asi, +b, +bc, +bcr, +bi, +br, +brcl, +c, +cfi, +cg, +cgf, +cgfi, +cgfr, +cgfrl, +cgh, +cghi, +cghrl, +cghsi, +cgit, +cgr, +cgrl, +cgrt, +ch, +chi, +chrl, +chsi, +chy, +cit, +cl, +clfhsi, +clfi, +clfit, +clg, +clgf, +clgfi, +clgfr, +clgfrl, +clghrl, +clghsi, +clgit, +clgr, +clgrl, +clgrt, +clgt, +clhhsi, +clhrl, +cli, +cliy, +clm, +clmy, +clr, +clrl, +clrt, +clt, +cly, +cr, +crl, +crt, +cy, +j, +jg, +laa, +laag, +lan, +lang, +lao, +laog, +lat, +lax, +laxg, +lcdfr, +ldgr, +ldr, +lgat, +lgdr, +lndfr, +lpdfr, +lzdr, +lzer, +mvghi, +mvhhi, +mvhi, +mvi, +mviy, +ni, +niy, +nop, +nopr, +ntstg, +oi, +oiy, +ppa, +st, +stc, +stcy, +std, +stdy, +ste, +stey, +stg, +stgrl, +sth, +sthrl, +sthy, +stoc, +stocg, +strl, +strv, +strvg, +strvh, +sty, +tend, +tm, +tmh, +tmhh, +tmhl, +tml, +tmlh, +tmll, +tmy, +vlgvb, +vlgvf, +vlgvg, +vlgvh, +vlr, +vlvgb, +vlvgf, +vlvgg, +vlvgh, +vlvgp, +vst, +vstbr, +vstbrf, +vstbrg, +vstbrh, +vstbrq, +vstebrf, +vstebrg, +vstef, +vsteg, +vsterf, +vsterg, +vsterh, +vstl, +vstrl, +vstrlr, +xi, +xiy" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_unit_fxd" "" +(cond [(eq_attr "mnemonic" +"dlgr, +dlr, +dr, +dsgfr, +dsgr" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_unit_lsu" "" +(cond [(eq_attr "mnemonic" +"a, +adb, +aeb, +ag, +agf, +agh, +agsi, +ah, +ahy, +al, +alc, +alcg, +alg, +algf, +algsi, +alsi, +aly, +asi, +ay, +c, +cdb, +ceb, +cg, +cgf, +cgfrl, +cgh, +cghrl, +cghsi, +cgrl, +ch, +chrl, +chsi, +chy, +cl, +clc, +clfhsi, +clg, +clgf, +clgfrl, +clghrl, +clghsi, +clgrl, +clgt, +clhhsi, +clhrl, +cli, +cliy, +clm, +clmy, +clrl, +clt, +cly, +crl, +cy, +ddb, +deb, +ear, +ic, +icm, +icmh, +icmy, +icy, +kdb, +keb, +l, +laa, +laag, +lan, +lang, +lao, +laog, +lat, +lax, +laxg, +lb, +lcbb, +ld, +lde, +ldeb, +ldy, +le, +ley, +lg, +lgat, +lgb, +lgf, +lgfrl, +lgh, +lghrl, +lgrl, +lh, +lhrl, +lhy, +llc, +llgc, +llgf, +llgfrl, +llgh, +llghrl, +llgt, +llh, +llhrl, +loc, +locg, +lrl, +lrv, +lrvg, +lrvh, +lt, +ltg, +ltgf, +ly, +m, +madb, +maeb, +mdb, +meeb, +mfy, +mg, +mgh, +mh, +mhy, +ml, +mlg, +ms, +msc, +msdb, +mseb, +msg, +msgc, +msgf, +msy, +mvghi, +mvhhi, +mvhi, +mvi, +mviy, +n, +ng, +ni, +niy, +ntstg, +ny, +o, +og, +oi, +oiy, +oy, +s, +sar, +sdb, +seb, +sfpc, +sg, +sgf, +sgh, +sh, +shy, +sl, +slb, +slbg, +slg, +slgf, +sly, +sqdb, +sqeb, +st, +stc, +stcy, +std, +stdy, +ste, +stey, +stg, +stgrl, +sth, +sthrl, +sthy, +stoc, +stocg, +strl, +strv, +strvg, +strvh, +sty, +sy, +tabort, +tm, +tmy, +vl, +vlbb, +vlbr, +vlbrf, +vlbrg, +vlbrh, +vlbrq, +vlbrrepf, +vlbrrepg, +vlbrreph, +vleb, +vlebrf, +vlebrg, +vlebrh, +vlef, +vleg, +vleh, +vlerf, +vlerg, +vlerh, +vll, +vllebrzf, +vllebrzg, +vllebrzh, +vllezb, +vllezf, +vllezg, +vllezh, +vllezlf, +vlrepb, +vlrepf, +vlrepg, +vlreph, +vlrl, +vlrlr, +vst, +vstbr, +vstbrf, +vstbrg, +vstbrh, +vstbrq, +vstebrf, +vstebrg, +vstef, +vsteg, +vsterf, +vsterg, +vsterh, +vstl, +vstrl, +vstrlr, +x, +xg, +xi, +xiy, +xy" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_unit_vfu" "" +(cond [(eq_attr "mnemonic" +"adb, +adbr, +adtr, +aeb, +aebr, +axbr, +axtr, +cdb, +cdbr, +cdtr, +ceb, +cebr, +cpsdr, +cxbr, +cxtr, +ddtr, +dxtr, +fidbr, +fidbra, +fidtr, +fiebr, +fiebra, +fixbr, +fixbra, +fixtr, +kdb, +kdbr, +kdtr, +keb, +kebr, +kxbr, +kxtr, +lcdbr, +lcebr, +lcxbr, +ldeb, +ldebr, +ldetr, +le, +ledbr, +ledtr, +ler, +ley, +lndbr, +lnebr, +lnxbr, +lpdbr, +lpebr, +lpxbr, +ltdbr, +ltdtr, +ltebr, +ltxbr, +ltxtr, +lxdb, +lxdbr, +lxdtr, +lxeb, +lxebr, +madb, +madbr, +maeb, +maebr, +mdb, +mdbr, +mdtr, +meeb, +meebr, 
+msdb, +msdbr, +mseb, +msebr, +mxbr, +mxtr, +sdb, +sdbr, +sdtr, +seb, +sebr, +sxbr, +sxtr, +tcdb, +tceb, +tcxb, +tdcdt, +tdcet, +tdcxt, +vab, +vaccb, +vacccq, +vaccf, +vaccg, +vacch, +vaccq, +vacq, +vaf, +vag, +vah, +vaq, +vavgb, +vavgf, +vavgg, +vavgh, +vavglb, +vavglf, +vavglg, +vavglh, +vbperm, +vcdgb, +vcdlgb, +vcefb, +vcelfb, +vceqb, +vceqbs, +vceqf, +vceqfs, +vceqg, +vceqgs, +vceqh, +vceqhs, +vcfeb, +vcfn, +vcgdb, +vchb, +vchbs, +vchf, +vchfs, +vchg, +vchgs, +vchh, +vchhs, +vchlb, +vchlbs, +vchlf, +vchlfs, +vchlg, +vchlgs, +vchlh, +vchlhs, +vcksm, +vclfeb, +vclfnh, +vclfnl, +vclgdb, +vclzb, +vclzf, +vclzg, +vclzh, +vcnf, +vcrnf, +vctzb, +vctzf, +vctzg, +vctzh, +verimb, +verimf, +verimg, +verimh, +verllb, +verllf, +verllg, +verllh, +verllvb, +verllvf, +verllvg, +verllvh, +veslb, +veslf, +veslg, +veslh, +veslvb, +veslvf, +veslvg, +veslvh, +vesrab, +vesraf, +vesrag, +vesrah, +vesravb, +vesravf, +vesravg, +vesravh, +vesrlb, +vesrlf, +vesrlg, +vesrlh, +vesrlvb, +vesrlvf, +vesrlvg, +vesrlvh, +vfadb, +vfasb, +vfcedb, +vfcedbs, +vfcesb, +vfcesbs, +vfchdb, +vfchdbs, +vfchedb, +vfchedbs, +vfchesb, +vfchesbs, +vfchsb, +vfchsbs, +vfeeb, +vfeef, +vfeeh, +vfeezbs, +vfeezfs, +vfeezhs, +vfeneb, +vfenef, +vfeneh, +vfenezb, +vfenezf, +vfenezh, +vfidb, +vfisb, +vfkedb, +vfkesb, +vfkhdb, +vfkhedb, +vfkhesb, +vfkhsb, +vflcdb, +vflcsb, +vflndb, +vflnsb, +vflpdb, +vflpsb, +vfmadb, +vfmasb, +vfmaxdb, +vfmaxsb, +vfmdb, +vfmindb, +vfminsb, +vfmsb, +vfmsdb, +vfmssb, +vfnmadb, +vfnmasb, +vfnmsdb, +vfnmssb, +vfsdb, +vfssb, +vftcidb, +vftcisb, +vgbm, +vgfmab, +vgfmaf, +vgfmag, +vgfmah, +vgfmb, +vgfmf, +vgfmg, +vgfmh, +vgm, +vgmb, +vgmf, +vgmg, +vgmh, +vistrb, +vistrbs, +vistrf, +vistrfs, +vistrh, +vistrhs, +vlcb, +vlcf, +vlcg, +vlch, +vldeb, +vleb, +vlebrf, +vlebrg, +vlebrh, +vledb, +vlef, +vleg, +vleh, +vleib, +vleif, +vleig, +vleih, +vlpb, +vlpf, +vlpg, +vlph, +vmaeb, +vmaef, +vmaeh, +vmahb, +vmahf, +vmahh, +vmalb, +vmaleb, +vmalef, +vmaleh, +vmalf, +vmalhb, +vmalhf, +vmalhh, +vmalhw, +vmalob, +vmalof, +vmaloh, +vmaob, +vmaof, +vmaoh, +vmeb, +vmef, +vmeh, +vmhb, +vmhf, +vmhh, +vmlb, +vmleb, +vmlef, +vmleh, +vmlf, +vmlhb, +vmlhf, +vmlhh, +vmlhw, +vmlob, +vmlof, +vmloh, +vmnb, +vmnf, +vmng, +vmnh, +vmnlb, +vmnlf, +vmnlg, +vmnlh, +vmob, +vmof, +vmoh, +vmrhb, +vmrhf, +vmrhg, +vmrhh, +vmrlb, +vmrlf, +vmrlg, +vmrlh, +vmslg, +vmxb, +vmxf, +vmxg, +vmxh, +vmxlb, +vmxlf, +vmxlg, +vmxlh, +vn, +vnc, +vnn, +vno, +vnot, +vnx, +vo, +voc, +vone, +vpdi, +vperm, +vpkf, +vpkg, +vpkh, +vpklsf, +vpklsfs, +vpklsg, +vpklsgs, +vpklsh, +vpklshs, +vpksf, +vpksfs, +vpksg, +vpksgs, +vpksh, +vpkshs, +vpopct, +vpopctb, +vpopctf, +vpopctg, +vpopcth, +vrepb, +vrepf, +vrepg, +vreph, +vrepi, +vrepib, +vrepif, +vrepig, +vrepih, +vsb, +vsbcbiq, +vsbiq, +vscbib, +vscbif, +vscbig, +vscbih, +vscbiq, +vsegb, +vsegf, +vsegh, +vsel, +vsf, +vsg, +vsh, +vsl, +vslb, +vsld, +vsldb, +vsq, +vsra, +vsrab, +vsrd, +vsrl, +vsrlb, +vsumb, +vsumgf, +vsumgh, +vsumh, +vsumqf, +vsumqg, +vtm, +vuphb, +vuphf, +vuphh, +vuplb, +vuplf, +vuplhb, +vuplhf, +vuplhh, +vuplhw, +vupllb, +vupllf, +vupllh, +vx, +vzero, +wcdgb, +wcdlgb, +wcefb, +wcelfb, +wcfeb, +wcgdb, +wclfeb, +wclgdb, +wfadb, +wfasb, +wfaxb, +wfcdb, +wfcedb, +wfcesb, +wfcexb, +wfcexbs, +wfchdb, +wfchedb, +wfchesb, +wfchexb, +wfchexbs, +wfchsb, +wfchxb, +wfchxbs, +wfcsb, +wfcxb, +wfidb, +wfisb, +wfixb, +wfkdb, +wfkedb, +wfkesb, +wfkexb, +wfkhdb, +wfkhedb, +wfkhesb, +wfkhexb, +wfkhsb, +wfkhxb, +wfksb, +wfkxb, +wflcdb, +wflcsb, +wflcxb, +wflld, +wflndb, +wflnsb, +wflnxb, +wflpdb, +wflpsb, +wflpxb, +wflrx, +wfmadb, 
+wfmasb, +wfmaxb, +wfmaxxb, +wfmdb, +wfminxb, +wfmsb, +wfmsdb, +wfmssb, +wfmsxb, +wfmxb, +wfnmaxb, +wfnmsxb, +wfsdb, +wfssb, +wfsxb, +wftcixb, +wldeb, +wledb" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_cracked" "" +(cond [(eq_attr "mnemonic" +"bas, +basr, +cdfbr, +cdftr, +cdgbr, +cdgtr, +cdlfbr, +cdlftr, +cdlgbr, +cdlgtr, +cefbr, +cegbr, +celfbr, +celgbr, +cfdbr, +cfebr, +cfxbr, +cgdbr, +cgdtr, +cgebr, +cgxbr, +cgxtr, +chhsi, +clfdbr, +clfdtr, +clfebr, +clfxbr, +clfxtr, +clgdbr, +clgdtr, +clgebr, +clgxbr, +clgxtr, +cs, +csg, +csy, +d, +efpc, +ex, +lcgfr, +lngfr, +lpgfr, +lpq, +lxr, +lzxr, +rxsbg, +stpq, +vgef, +vgeg, +vscef, +vsceg, +vsteb, +vstebrh, +vsteh" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_expanded" "" +(cond [(eq_attr "mnemonic" +"cds, +cdsg, +cdsy, +cxfbr, +cxftr, +cxgbr, +cxgtr, +cxlfbr, +cxlftr, +cxlgbr, +cxlgtr, +dl, +dlg, +dsg, +dsgf, +lam, +lm, +lmg, +lmy, +sldl, +srda, +srdl, +stam, +stm, +stmg, +stmy, +tbegin, +tbeginc" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_groupalone" "" +(cond [(eq_attr "mnemonic" +"alc, +alcg, +alcgr, +alcr, +axbr, +axtr, +clc, +cxbr, +cxtr, +dlgr, +dlr, +dr, +dsgfr, +dsgr, +dxbr, +dxtr, +fixbr, +fixbra, +fixtr, +flogr, +kxbr, +kxtr, +lcxbr, +lnxbr, +lpxbr, +ltxbr, +ltxtr, +lxdb, +lxdbr, +lxdtr, +lxeb, +lxebr, +m, +madb, +maeb, +maebr, +mfy, +mg, +mgrk, +ml, +mlg, +mlgr, +mlr, +mr, +msdb, +mseb, +msebr, +mvc, +mxbr, +mxtr, +nc, +oc, +ppa, +sfpc, +slb, +slbg, +slbgr, +slbr, +sqxbr, +sxbr, +sxtr, +tabort, +tcxb, +tdcxt, +tend, +xc" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_endgroup" "" +(cond [(eq_attr "mnemonic" +"bras, +brasl, +exrl, +ipm" +) + (const_int 1)] (const_int 0))) + +(define_attr "z16_groupoftwo" "" +(cond [(eq_attr "mnemonic" +"vacccq, +vacq, +vfmadb, +vfmasb, +vfmsdb, +vfmssb, +vfnmadb, +vfnmasb, +vfnmsdb, +vfnmssb, +vgfmab, +vgfmaf, +vgfmag, +vgfmah, +vmaeb, +vmaef, +vmaeh, +vmahb, +vmahf, +vmahh, +vmalb, +vmaleb, +vmalef, +vmaleh, +vmalf, +vmalhb, +vmalhf, +vmalhh, +vmalhw, +vmalob, +vmalof, +vmaloh, +vmaob, +vmaof, +vmaoh, +vmslg, +vperm, +vsbcbiq, +vsbiq, +vsel, +wfmadb, +wfmasb, +wfmaxb, +wfmsdb, +wfmssb, +wfmsxb, +wfnmaxb, +wfnmsxb" +) + (const_int 1)] (const_int 0))) + +(define_insn_reservation "z16_0" 0 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"a, +afi, +ag, +agfi, +aghi, +aghik, +agr, +agrk, +ahi, +ahik, +al, +alfi, +alg, +algf, +algfi, +algfr, +alghsik, +algr, +algrk, +alhsik, +alr, +alrk, +aly, +ar, +ark, +ay, +b, +bc, +bcr, +bi, +br, +bras, +brasl, +brcl, +c, +cfi, +cg, +cgfi, +cghi, +cghsi, +cgit, +cgr, +cgrl, +cgrt, +chi, +chsi, +cit, +cl, +clfhsi, +clfi, +clfit, +clg, +clgf, +clgfi, +clgfr, +clgfrl, +clghrl, +clghsi, +clgit, +clgr, +clgrl, +clgrt, +clgt, +clhhsi, +clhrl, +cli, +cliy, +clr, +clrl, +clrt, +clt, +cly, +cr, +crl, +crt, +cy, +etnd, +exrl, +ic, +icm, +icmh, +icmy, +icy, +iihf, +iilf, +j, +jg, +la, +larl, +lat, +lay, +lb, +lbr, +lcdfr, +lcgr, +lcr, +ldgr, +ldr, +lgat, +lgb, +lgbr, +lgf, +lgfi, +lgfr, +lgfrl, +lgh, +lghi, +lghr, +lghrl, +lgr, +lh, +lhi, +lhr, +lhrl, +lhy, +llcr, +llgcr, +llgfr, +llghr, +llgtr, +llhr, +llihf, +llihh, +llihl, +llilf, +llilh, +llill, +lndfr, +lngr, +lnr, +lpdfr, +lpgr, +lpr, +lr, +lrv, +lrvg, +lrvgr, +lrvh, +lrvr, +lt, +ltg, +ltgf, +ltgfr, +ltgr, +ltr, +lzdr, +lzer, +n, +ncgrk, +ncrk, +ng, +ngr, +ngrk, +nihf, +nihh, +nihl, +nilf, +nilh, +nill, +nngrk, +nnrk, +nogrk, +nop, +nopr, +nork, +nr, +nrk, +nxgrk, +nxrk, +ny, +o, +ocgrk, +ocrk, +og, +ogr, +ogrk, +oihf, +oihh, +oihl, +oilf, +oilh, +oill, +or, +ork, +oy, 
+pfpo, +risbg, +risbgn, +rll, +rllg, +rnsbg, +rosbg, +s, +sg, +sgr, +sgrk, +sl, +sldl, +slfi, +slg, +slgf, +slgfi, +slgfr, +slgr, +slgrk, +sll, +sllg, +sllk, +slr, +slrk, +sly, +sr, +sra, +srag, +srak, +srda, +srdl, +srk, +srl, +srlg, +srlk, +sy, +tm, +tmh, +tmhh, +tmhl, +tml, +tmlh, +tmll, +tmy, +vlr, +vlvgb, +vlvgf, +vlvgg, +vlvgh, +x, +xg, +xgr, +xgrk, +xihf, +xilf, +xr, +xrk, +xy" +)) "nothing") + +(define_insn_reservation "z16_1" 1 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"agf, +agfr, +agh, +agsi, +ah, +ahy, +algsi, +alsi, +asi, +cgf, +cgfr, +cgfrl, +cgh, +cghrl, +ch, +chrl, +chy, +clm, +clmy, +cpsdr, +laa, +laag, +lan, +lang, +lao, +laog, +lax, +laxg, +le, +ler, +ley, +loc, +locg, +locghi, +locgr, +lochi, +locr, +mvghi, +mvhhi, +mvhi, +mvi, +mviy, +ni, +niy, +ntstg, +oi, +oiy, +selgr, +selr, +sgf, +sgfr, +sgh, +sh, +shy, +st, +stc, +stcy, +stg, +stgrl, +sth, +sthrl, +sthy, +stoc, +stocg, +strl, +strv, +strvg, +strvh, +sty, +vab, +vaccb, +vacccq, +vaccf, +vaccg, +vacch, +vaccq, +vacq, +vaf, +vag, +vah, +vaq, +vavgb, +vavgf, +vavgg, +vavgh, +vavglb, +vavglf, +vavglg, +vavglh, +vbperm, +vceqb, +vceqf, +vceqg, +vceqh, +vcfn, +vchb, +vchf, +vchg, +vchh, +vchlb, +vchlf, +vchlg, +vchlh, +vclfnh, +vclfnl, +vclzb, +vclzf, +vclzg, +vclzh, +vcnf, +vcrnf, +vctzb, +vctzf, +vctzg, +vctzh, +verimb, +verimf, +verimg, +verimh, +verllb, +verllf, +verllg, +verllh, +verllvb, +verllvf, +verllvg, +verllvh, +veslb, +veslf, +veslg, +veslh, +veslvb, +veslvf, +veslvg, +veslvh, +vesrab, +vesraf, +vesrag, +vesrah, +vesravb, +vesravf, +vesravg, +vesravh, +vesrlb, +vesrlf, +vesrlg, +vesrlh, +vesrlvb, +vesrlvf, +vesrlvg, +vesrlvh, +vfcedb, +vfcesb, +vfchdb, +vfchedb, +vfchesb, +vfchsb, +vfkedb, +vfkesb, +vfkhdb, +vfkhedb, +vfkhesb, +vfkhsb, +vflcdb, +vflcsb, +vflndb, +vflnsb, +vflpdb, +vflpsb, +vfmaxdb, +vfmaxsb, +vfmindb, +vfminsb, +vgbm, +vgm, +vgmb, +vgmf, +vgmg, +vgmh, +vlcb, +vlcf, +vlcg, +vlch, +vleb, +vlebrf, +vlebrg, +vlebrh, +vlef, +vleg, +vleh, +vleib, +vleif, +vleig, +vleih, +vlpb, +vlpf, +vlpg, +vlph, +vmnb, +vmnf, +vmng, +vmnh, +vmnlb, +vmnlf, +vmnlg, +vmnlh, +vmrhb, +vmrhf, +vmrhg, +vmrhh, +vmrlb, +vmrlf, +vmrlg, +vmrlh, +vmxb, +vmxf, +vmxg, +vmxh, +vmxlb, +vmxlf, +vmxlg, +vmxlh, +vn, +vnc, +vnn, +vno, +vnot, +vnx, +vo, +voc, +vone, +vpdi, +vperm, +vpkf, +vpkg, +vpkh, +vpklsf, +vpklsg, +vpklsh, +vpksf, +vpksg, +vpksh, +vpopct, +vpopctb, +vpopctf, +vpopctg, +vpopcth, +vrepb, +vrepf, +vrepg, +vreph, +vrepi, +vrepib, +vrepif, +vrepig, +vrepih, +vsb, +vsbcbiq, +vsbiq, +vscbib, +vscbif, +vscbig, +vscbih, +vscbiq, +vsegb, +vsegf, +vsegh, +vsel, +vsf, +vsg, +vsh, +vsl, +vslb, +vsld, +vsldb, +vsq, +vsra, +vsrab, +vsrd, +vsrl, +vsrlb, +vuphb, +vuphf, +vuphh, +vuplb, +vuplf, +vuplhb, +vuplhf, +vuplhh, +vuplhw, +vupllb, +vupllf, +vupllh, +vx, +vzero, +wfcedb, +wfcesb, +wfcexb, +wfchdb, +wfchedb, +wfchesb, +wfchexb, +wfchsb, +wfchxb, +wfkedb, +wfkesb, +wfkexb, +wfkhdb, +wfkhedb, +wfkhesb, +wfkhexb, +wfkhsb, +wfkhxb, +wflcdb, +wflcsb, +wflcxb, +wflndb, +wflnsb, +wflnxb, +wflpdb, +wflpsb, +wflpxb, +wfmaxxb, +wfminxb, +xi, +xiy" +)) "nothing") + +(define_insn_reservation "z16_2" 2 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cdb, +cdbr, +ceb, +cebr, +ear, +ipm, +kdb, +kdbr, +keb, +kebr, +l, +lcbb, +lcdbr, +lcebr, +ld, +lde, +ldy, +lg, +lgdr, +lgrl, +llc, +llgc, +llgf, +llgfrl, +llgh, +llghrl, +llgt, +llh, +llhrl, +lm, +lmg, +lmy, +lndbr, +lnebr, +lpdbr, +lpebr, +lrl, +ltdbr, +ltebr, +ly, +popcnt, +sar, +tcdb, +tceb, +vceqbs, +vceqfs, +vceqgs, +vceqhs, +vchbs, +vchfs, +vchgs, +vchhs, +vchlbs, 
+vchlfs, +vchlgs, +vchlhs, +vfcedbs, +vfcesbs, +vfchdbs, +vfchedbs, +vfchesbs, +vfchsbs, +vfeeb, +vfeef, +vfeeh, +vfeneb, +vfenef, +vfeneh, +vfenezb, +vfenezf, +vfenezh, +vftcidb, +vftcisb, +vistrb, +vistrf, +vistrh, +vlbrrepf, +vlbrrepg, +vlbrreph, +vlgvb, +vlgvf, +vlgvg, +vlgvh, +vllebrzf, +vllebrzg, +vllebrzh, +vllezb, +vllezf, +vllezg, +vllezh, +vllezlf, +vlrepb, +vlrepf, +vlrepg, +vlreph, +vlrl, +vlvgp, +vpklsfs, +vpklsgs, +vpklshs, +vpksfs, +vpksgs, +vpkshs, +wfcdb, +wfcexbs, +wfchexbs, +wfchxbs, +wfcsb, +wfcxb, +wfkdb, +wfksb, +wfkxb, +wftcixb" +)) "nothing") + +(define_insn_reservation "z16_3" 3 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cds, +cdsy, +mgh, +mghi, +mh, +mhi, +mhy, +std, +stdy, +ste, +stey, +vcksm, +vfeezbs, +vfeezfs, +vfeezhs, +vgfmab, +vgfmaf, +vgfmag, +vgfmah, +vgfmb, +vgfmf, +vgfmg, +vgfmh, +vistrbs, +vistrfs, +vistrhs, +vl, +vlbb, +vlbr, +vlbrf, +vlbrg, +vlbrh, +vlbrq, +vlerf, +vlerg, +vlerh, +vll, +vlrlr, +vmaeb, +vmaef, +vmaeh, +vmahb, +vmahf, +vmahh, +vmalb, +vmaleb, +vmalef, +vmaleh, +vmalf, +vmalhb, +vmalhf, +vmalhh, +vmalhw, +vmalob, +vmalof, +vmaloh, +vmaob, +vmaof, +vmaoh, +vmeb, +vmef, +vmeh, +vmhb, +vmhf, +vmhh, +vmlb, +vmleb, +vmlef, +vmleh, +vmlf, +vmlhb, +vmlhf, +vmlhh, +vmlhw, +vmlob, +vmlof, +vmloh, +vmob, +vmof, +vmoh, +vsumb, +vsumgf, +vsumgh, +vsumh, +vsumqf, +vsumqg, +vtm" +)) "nothing") + +(define_insn_reservation "z16_4" 4 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"bas, +basr, +chhsi, +clc, +ex, +lam, +lcgfr, +lngfr, +lpgfr, +lxr, +lzxr, +ms, +msfi, +msgf, +msgfi, +msgfr, +msr, +msy, +mvc, +nc, +oc, +ppa, +rxsbg, +tabort, +tbegin, +tbeginc, +tend, +vst, +vstbr, +vstbrf, +vstbrg, +vstbrh, +vstbrq, +vstebrf, +vstebrg, +vstef, +vsteg, +vsterf, +vsterg, +vsterh, +vstl, +vstrl, +vstrlr, +xc" +)) "nothing") + +(define_insn_reservation "z16_5" 5 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"adb, +adbr, +aeb, +aebr, +alc, +alcg, +alcgr, +alcr, +cs, +csg, +csy, +fidbr, +fidbra, +fiebr, +fiebra, +ldeb, +ldebr, +ledbr, +madbr, +mdb, +mdbr, +meeb, +meebr, +msc, +msdbr, +msrkc, +sdb, +sdbr, +seb, +sebr, +slb, +slbg, +slbgr, +slbr, +stm, +stmg, +stmy, +vcdgb, +vcdlgb, +vcefb, +vcelfb, +vcfeb, +vcgdb, +vclfeb, +vclgdb, +vfadb, +vfasb, +vfidb, +vfisb, +vfmadb, +vfmasb, +vfmdb, +vfmsb, +vfmsdb, +vfmssb, +vfnmadb, +vfnmasb, +vfnmsdb, +vfnmssb, +vfsdb, +vfssb, +vldeb, +vledb, +vmslg, +wcdgb, +wcdlgb, +wcefb, +wcelfb, +wcfeb, +wcgdb, +wclfeb, +wclgdb, +wfadb, +wfasb, +wfidb, +wfisb, +wflld, +wfmadb, +wfmasb, +wfmdb, +wfmsb, +wfmsdb, +wfmssb, +wfsdb, +wfssb, +wldeb, +wledb" +)) "nothing") + +(define_insn_reservation "z16_6" 6 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"msg, +msgr, +sfpc" +)) "nothing") + +(define_insn_reservation "z16_7" 7 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"adtr, +cdtr, +fidtr, +kdtr, +ldetr, +ltdtr, +msgc, +msgrkc, +sdtr, +tdcdt, +tdcet, +vgef, +vgeg" +)) "nothing") + +(define_insn_reservation "z16_8" 8 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cdsg, +flogr, +lpq, +stpq, +vsteb, +vstebrh, +vsteh" +)) "nothing") + +(define_insn_reservation "z16_9" 9 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cdfbr, +cdgbr, +cdlfbr, +cdlgbr, +cefbr, +cegbr, +celfbr, +celgbr, +cxfbr, +cxgbr, +cxlfbr, +cxlgbr, +m, +madb, +maeb, +maebr, +mfy, +ml, +mlr, +mr, +msdb, +mseb, +msebr, +stam, +wfaxb, +wfixb, +wflrx, +wfsxb" +)) "nothing") + +(define_insn_reservation "z16_10" 10 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"lxdb, +lxdbr, +lxeb, +lxebr, +vscef, +vsceg" +)) "nothing") + 
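The remaining reservations below follow the same pattern, with longer latencies reserved for multiplies, conversions, divides, square roots and extended-precision operations. As a rough sketch only (my own illustrative model, not GCC's scheduler — the real consumers are the generated get_attr_z16_* functions and s390_get_sched_attrmask further down in this commit), the grouping attributes defined above can be read as constraints on how instructions are packed into decode groups:

/* Illustrative sketch only: a toy model of how the z16 grouping attributes
   might constrain decode-group formation.  The three-slot group size and all
   packing rules here are assumptions made for illustration; GCC's real logic
   lives in s390.cc, not in this snippet.  */
#include <stdbool.h>

enum z16_group_attr { Z16_NONE, Z16_CRACKED, Z16_EXPANDED,
                      Z16_GROUPALONE, Z16_ENDGROUP, Z16_GROUPOFTWO };

struct group_state { int used; int limit; };   /* limit starts at 3 */

/* Try to place one insn with attribute ATTR into the current group;
   return false if a new group would have to be started first.  */
static bool
try_place (struct group_state *g, enum z16_group_attr attr)
{
  if (attr == Z16_GROUPALONE || attr == Z16_EXPANDED)
    {                                     /* needs a group of its own */
      if (g->used != 0)
        return false;
      g->used = g->limit;
      return true;
    }

  if (attr == Z16_GROUPOFTWO)
    {                                     /* group may hold at most two insns */
      if (g->used >= 2)
        return false;
      g->limit = 2;
    }

  int cost = (attr == Z16_CRACKED) ? 2 : 1;   /* cracked insns use two slots */
  if (g->used + cost > g->limit)
    return false;

  g->used += cost;
  if (attr == Z16_ENDGROUP)               /* nothing else may follow in this group */
    g->used = g->limit;
  return true;
}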
+(define_insn_reservation "z16_11" 11 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cfdbr, +cfebr, +cgdbr, +cgebr, +clfdbr, +clfebr, +clgdbr, +clgebr, +mg, +mgrk, +mlg, +mlgr" +)) "nothing") + +(define_insn_reservation "z16_12" 12 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cxbr, +cxftr, +cxlftr, +cxtr, +kxbr, +kxtr, +tcxb, +tdcxt" +)) "nothing") + +(define_insn_reservation "z16_13" 13 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"axbr, +axtr, +fixbr, +fixbra, +fixtr, +lcxbr, +lnxbr, +lpxbr, +ltxbr, +ltxtr, +lxdtr, +sxbr, +sxtr" +)) "nothing") + +(define_insn_reservation "z16_14" 14 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cfxbr, +cgxbr, +clfxbr, +clgxbr, +ledtr" +)) "nothing") + +(define_insn_reservation "z16_16" 16 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cdftr, +cdlftr" +)) "nothing") + +(define_insn_reservation "z16_20" 20 + (and (eq_attr "cpu" "z16") +(eq_attr "mnemonic" +"cdgtr, +cdlgtr, +cgdtr, +cgxtr, +clfdtr, +clfxtr, +clgdtr, +clgxtr, +cxgtr, +cxlgtr, +d, +ddb, +ddbr, +ddtr, +deb, +debr, +dl, +dlg, +dlgr, +dlr, +dr, +dsg, +dsgf, +dsgfr, +dsgr, +dxbr, +dxtr, +efpc, +mdtr, +mxbr, +mxtr, +sqdb, +sqdbr, +sqeb, +sqebr, +sqxbr, +vfddb, +vfdsb, +vfsqdb, +vfsqsb, +wfddb, +wfdsb, +wfdxb, +wfmaxb, +wfmsxb, +wfmxb, +wfnmaxb, +wfnmsxb, +wfsqdb, +wfsqxb" +)) "nothing") + diff --git a/gcc/config/s390/driver-native.cc b/gcc/config/s390/driver-native.cc index 48524c4..b5eb222 100644 --- a/gcc/config/s390/driver-native.cc +++ b/gcc/config/s390/driver-native.cc @@ -123,8 +123,12 @@ s390_host_detect_local_cpu (int argc, const char **argv) case 0x8562: cpu = "z15"; break; + case 0x3931: + case 0x3932: + cpu = "z16"; + break; default: - cpu = "arch14"; + cpu = "z16"; break; } } diff --git a/gcc/config/s390/s390-opts.h b/gcc/config/s390/s390-opts.h index 1ec8463..4ef82ac 100644 --- a/gcc/config/s390/s390-opts.h +++ b/gcc/config/s390/s390-opts.h @@ -38,7 +38,7 @@ enum processor_type PROCESSOR_2964_Z13, PROCESSOR_3906_Z14, PROCESSOR_8561_Z15, - PROCESSOR_ARCH14, + PROCESSOR_3931_Z16, PROCESSOR_NATIVE, PROCESSOR_max }; diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index e625159..fd4acaa 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -49,7 +49,6 @@ extern void s390_function_profiler (FILE *, int); extern void s390_set_has_landing_pad_p (bool); extern bool s390_hard_regno_rename_ok (unsigned int, unsigned int); extern int s390_class_max_nregs (enum reg_class, machine_mode); -extern bool s390_function_arg_vector (machine_mode, const_tree); extern bool s390_return_addr_from_memory(void); extern bool s390_fma_allowed_p (machine_mode); #if S390_USE_TARGET_ATTRIBUTE diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index d2af6d8..444b1ec 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -70,6 +70,7 @@ along with GCC; see the file COPYING3. 
If not see #include "debug.h" #include "langhooks.h" #include "internal-fn.h" +#include "gimple-iterator.h" #include "gimple-fold.h" #include "tree-eh.h" #include "gimplify.h" @@ -337,7 +338,7 @@ const struct s390_processor processor_table[] = { "z13", "z13", PROCESSOR_2964_Z13, &zEC12_cost, 11 }, { "z14", "arch12", PROCESSOR_3906_Z14, &zEC12_cost, 12 }, { "z15", "arch13", PROCESSOR_8561_Z15, &zEC12_cost, 13 }, - { "arch14", "arch14", PROCESSOR_ARCH14, &zEC12_cost, 14 }, + { "z16", "arch14", PROCESSOR_3931_Z16, &zEC12_cost, 14 }, { "native", "", PROCESSOR_NATIVE, NULL, 0 } }; @@ -853,12 +854,6 @@ s390_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, error ("Builtin %qF requires z15 or higher", fndecl); return const0_rtx; } - - if ((bflags & B_NNPA) && !TARGET_NNPA) - { - error ("Builtin %qF requires arch14 or higher.", fndecl); - return const0_rtx; - } } if (fcode >= S390_OVERLOADED_BUILTIN_VAR_OFFSET && fcode < S390_ALL_BUILTIN_MAX) @@ -8525,7 +8520,7 @@ s390_issue_rate (void) case PROCESSOR_2827_ZEC12: case PROCESSOR_2964_Z13: case PROCESSOR_3906_Z14: - case PROCESSOR_ARCH14: + case PROCESSOR_3931_Z16: default: return 1; } @@ -8774,7 +8769,7 @@ static machine_mode constant_modes[] = QImode, V1QImode }; -#define NR_C_MODES (sizeof (constant_modes) / sizeof (constant_modes[0])) +#define NR_C_MODES (ARRAY_SIZE (constant_modes)) struct constant { @@ -12148,29 +12143,26 @@ s390_function_arg_size (machine_mode mode, const_tree type) gcc_unreachable (); } -/* Return true if a function argument of type TYPE and mode MODE - is to be passed in a vector register, if available. */ +/* Return true if a variable of TYPE should be passed as single value + with type CODE. If STRICT_SIZE_CHECK_P is true the sizes of the + record type and the field type must match. -bool -s390_function_arg_vector (machine_mode mode, const_tree type) + The ABI says that record types with a single member are treated + just like that member would be. This function is a helper to + detect such cases. The function also produces the proper + diagnostics for cases where the outcome might be different + depending on the GCC version. */ +static bool +s390_single_field_struct_p (enum tree_code code, const_tree type, + bool strict_size_check_p) { - if (!TARGET_VX_ABI) - return false; - - if (s390_function_arg_size (mode, type) > 16) - return false; - - /* No type info available for some library calls ... */ - if (!type) - return VECTOR_MODE_P (mode); - - /* The ABI says that record types with a single member are treated - just like that member would be. */ int empty_base_seen = 0; + bool zero_width_bf_skipped_p = false; const_tree orig_type = type; + while (TREE_CODE (type) == RECORD_TYPE) { - tree field, single = NULL_TREE; + tree field, single_type = NULL_TREE; for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) { @@ -12187,48 +12179,108 @@ s390_function_arg_vector (machine_mode mode, const_tree type) continue; } - if (single == NULL_TREE) - single = TREE_TYPE (field); + if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field)) + { + zero_width_bf_skipped_p = true; + continue; + } + + if (single_type == NULL_TREE) + single_type = TREE_TYPE (field); else return false; } - if (single == NULL_TREE) + if (single_type == NULL_TREE) return false; - else - { - /* If the field declaration adds extra byte due to - e.g. padding this is not accepted as vector type. 
*/ - if (int_size_in_bytes (single) <= 0 - || int_size_in_bytes (single) != int_size_in_bytes (type)) - return false; - type = single; - } + + /* Reaching this point we have a struct with a single member and + zero or more zero-sized bit-fields which have been skipped in the + past. */ + + /* If ZERO_WIDTH_BF_SKIPPED_P then the struct will not be accepted. In case + we are not supposed to emit a warning exit early. */ + if (zero_width_bf_skipped_p && !warn_psabi) + return false; + + /* If the field declaration adds extra bytes due to padding this + is not accepted with STRICT_SIZE_CHECK_P. */ + if (strict_size_check_p + && (int_size_in_bytes (single_type) <= 0 + || int_size_in_bytes (single_type) != int_size_in_bytes (type))) + return false; + + type = single_type; } - if (!VECTOR_TYPE_P (type)) + if (TREE_CODE (type) != code) return false; - if (warn_psabi && empty_base_seen) + if (warn_psabi) { - static unsigned last_reported_type_uid; unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (orig_type)); - if (uid != last_reported_type_uid) - { - const char *url = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base"; - last_reported_type_uid = uid; - if (empty_base_seen & 1) - inform (input_location, - "parameter passing for argument of type %qT when C++17 " - "is enabled changed to match C++14 %{in GCC 10.1%}", - orig_type, url); - else - inform (input_location, - "parameter passing for argument of type %qT with " - "%<[[no_unique_address]]%> members changed " - "%{in GCC 10.1%}", orig_type, url); + + if (empty_base_seen) + { + static unsigned last_reported_type_uid_empty_base; + if (uid != last_reported_type_uid_empty_base) + { + last_reported_type_uid_empty_base = uid; + const char *url = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base"; + if (empty_base_seen & 1) + inform (input_location, + "parameter passing for argument of type %qT when C++17 " + "is enabled changed to match C++14 %{in GCC 10.1%}", + orig_type, url); + else + inform (input_location, + "parameter passing for argument of type %qT with " + "%<[[no_unique_address]]%> members changed " + "%{in GCC 10.1%}", orig_type, url); + } + } + + /* For C++ older GCCs ignored zero width bitfields and therefore + passed structs more often as single values than GCC 12 does. + So diagnostics are only required in cases where we do NOT + accept the struct to be passed as single value. */ + if (zero_width_bf_skipped_p) + { + static unsigned last_reported_type_uid_zero_width; + if (uid != last_reported_type_uid_zero_width) + { + last_reported_type_uid_zero_width = uid; + inform (input_location, + "parameter passing for argument of type %qT with " + "zero-width bit fields members changed in GCC 12", + orig_type); + } } } + + return !zero_width_bf_skipped_p; +} + + +/* Return true if a function argument of type TYPE and mode MODE + is to be passed in a vector register, if available. */ + +static bool +s390_function_arg_vector (machine_mode mode, const_tree type) +{ + if (!TARGET_VX_ABI) + return false; + + if (s390_function_arg_size (mode, type) > 16) + return false; + + /* No type info available for some library calls ... */ + if (!type) + return VECTOR_MODE_P (mode); + + if (!s390_single_field_struct_p (VECTOR_TYPE, type, true)) + return false; + return true; } @@ -12249,64 +12301,9 @@ s390_function_arg_float (machine_mode mode, const_tree type) if (!type) return mode == SFmode || mode == DFmode || mode == SDmode || mode == DDmode; - /* The ABI says that record types with a single member are treated - just like that member would be. 
*/ - int empty_base_seen = 0; - const_tree orig_type = type; - while (TREE_CODE (type) == RECORD_TYPE) - { - tree field, single = NULL_TREE; - - for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) != FIELD_DECL) - continue; - if (DECL_FIELD_ABI_IGNORED (field)) - { - if (lookup_attribute ("no_unique_address", - DECL_ATTRIBUTES (field))) - empty_base_seen |= 2; - else - empty_base_seen |= 1; - continue; - } - - if (single == NULL_TREE) - single = TREE_TYPE (field); - else - return false; - } - - if (single == NULL_TREE) - return false; - else - type = single; - } - - if (TREE_CODE (type) != REAL_TYPE) + if (!s390_single_field_struct_p (REAL_TYPE, type, false)) return false; - if (warn_psabi && empty_base_seen) - { - static unsigned last_reported_type_uid; - unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (orig_type)); - if (uid != last_reported_type_uid) - { - const char *url = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base"; - last_reported_type_uid = uid; - if (empty_base_seen & 1) - inform (input_location, - "parameter passing for argument of type %qT when C++17 " - "is enabled changed to match C++14 %{in GCC 10.1%}", - orig_type, url); - else - inform (input_location, - "parameter passing for argument of type %qT with " - "%<[[no_unique_address]]%> members changed " - "%{in GCC 10.1%}", orig_type, url); - } - } - return true; } @@ -14879,7 +14876,6 @@ s390_get_sched_attrmask (rtx_insn *insn) mask |= S390_SCHED_ATTR_MASK_GROUPOFTWO; break; case PROCESSOR_8561_Z15: - case PROCESSOR_ARCH14: if (get_attr_z15_cracked (insn)) mask |= S390_SCHED_ATTR_MASK_CRACKED; if (get_attr_z15_expanded (insn)) @@ -14891,6 +14887,18 @@ s390_get_sched_attrmask (rtx_insn *insn) if (get_attr_z15_groupoftwo (insn)) mask |= S390_SCHED_ATTR_MASK_GROUPOFTWO; break; + case PROCESSOR_3931_Z16: + if (get_attr_z16_cracked (insn)) + mask |= S390_SCHED_ATTR_MASK_CRACKED; + if (get_attr_z16_expanded (insn)) + mask |= S390_SCHED_ATTR_MASK_EXPANDED; + if (get_attr_z16_endgroup (insn)) + mask |= S390_SCHED_ATTR_MASK_ENDGROUP; + if (get_attr_z16_groupalone (insn)) + mask |= S390_SCHED_ATTR_MASK_GROUPALONE; + if (get_attr_z16_groupoftwo (insn)) + mask |= S390_SCHED_ATTR_MASK_GROUPOFTWO; + break; default: gcc_unreachable (); } @@ -14927,7 +14935,6 @@ s390_get_unit_mask (rtx_insn *insn, int *units) mask |= 1 << 3; break; case PROCESSOR_8561_Z15: - case PROCESSOR_ARCH14: *units = 4; if (get_attr_z15_unit_lsu (insn)) mask |= 1 << 0; @@ -14938,6 +14945,17 @@ s390_get_unit_mask (rtx_insn *insn, int *units) if (get_attr_z15_unit_vfu (insn)) mask |= 1 << 3; break; + case PROCESSOR_3931_Z16: + *units = 4; + if (get_attr_z16_unit_lsu (insn)) + mask |= 1 << 0; + if (get_attr_z16_unit_fxa (insn)) + mask |= 1 << 1; + if (get_attr_z16_unit_fxb (insn)) + mask |= 1 << 2; + if (get_attr_z16_unit_vfu (insn)) + mask |= 1 << 3; + break; default: gcc_unreachable (); } @@ -14951,7 +14969,7 @@ s390_is_fpd (rtx_insn *insn) return false; return get_attr_z13_unit_fpd (insn) || get_attr_z14_unit_fpd (insn) - || get_attr_z15_unit_fpd (insn); + || get_attr_z15_unit_fpd (insn) || get_attr_z16_unit_fpd (insn); } static bool @@ -14961,7 +14979,7 @@ s390_is_fxd (rtx_insn *insn) return false; return get_attr_z13_unit_fxd (insn) || get_attr_z14_unit_fxd (insn) - || get_attr_z15_unit_fxd (insn); + || get_attr_z15_unit_fxd (insn) || get_attr_z16_unit_fxd (insn); } /* Returns TRUE if INSN is a long-running instruction. 
*/ @@ -17157,9 +17175,13 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) hook is supposed to emit the required INSNs. */ bool -s390_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1, +s390_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, const vec_perm_indices &sel) { + if (vmode != op_mode) + return false; + struct expand_vec_perm_d d; unsigned int i, nelt; diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h index 5a64048..2e1bc71 100644 --- a/gcc/config/s390/s390.h +++ b/gcc/config/s390/s390.h @@ -43,12 +43,12 @@ enum processor_flags PF_VXE2 = 8192, PF_Z15 = 16384, PF_NNPA = 32768, - PF_ARCH14 = 65536 + PF_Z16 = 65536 }; /* This is necessary to avoid a warning about comparing different enum types. */ -#define s390_tune_attr ((enum attr_cpu)(s390_tune > PROCESSOR_8561_Z15 ? PROCESSOR_8561_Z15 : s390_tune )) +#define s390_tune_attr ((enum attr_cpu)(s390_tune > PROCESSOR_3931_Z16 ? PROCESSOR_3931_Z16 : s390_tune )) /* These flags indicate that the generated code should run on a cpu providing the respective hardware facility regardless of the @@ -110,10 +110,10 @@ enum processor_flags (s390_arch_flags & PF_VXE2) #define TARGET_CPU_VXE2_P(opts) \ (opts->x_s390_arch_flags & PF_VXE2) -#define TARGET_CPU_ARCH14 \ - (s390_arch_flags & PF_ARCH14) -#define TARGET_CPU_ARCH14_P(opts) \ - (opts->x_s390_arch_flags & PF_ARCH14) +#define TARGET_CPU_Z16 \ + (s390_arch_flags & PF_Z16) +#define TARGET_CPU_Z16_P(opts) \ + (opts->x_s390_arch_flags & PF_Z16) #define TARGET_CPU_NNPA \ (s390_arch_flags & PF_NNPA) #define TARGET_CPU_NNPA_P(opts) \ @@ -177,9 +177,9 @@ enum processor_flags (TARGET_VX && TARGET_CPU_VXE2) #define TARGET_VXE2_P(opts) \ (TARGET_VX_P (opts) && TARGET_CPU_VXE2_P (opts)) -#define TARGET_ARCH14 (TARGET_ZARCH && TARGET_CPU_ARCH14) -#define TARGET_ARCH14_P(opts) \ - (TARGET_ZARCH_P (opts->x_target_flags) && TARGET_CPU_ARCH14_P (opts)) +#define TARGET_Z16 (TARGET_ZARCH && TARGET_CPU_Z16) +#define TARGET_Z16_P(opts) \ + (TARGET_ZARCH_P (opts->x_target_flags) && TARGET_CPU_Z16_P (opts)) #define TARGET_NNPA \ (TARGET_ZARCH && TARGET_CPU_NNPA) #define TARGET_NNPA_P(opts) \ diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index d0f233e..55c0064 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -518,11 +518,11 @@ ;; Processor type. This attribute must exactly match the processor_type ;; enumeration in s390.h. 
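As an aside on the s390.cc refactoring above: s390_single_field_struct_p centralises the ABI rule that a record with exactly one member is passed as if it were that member, and that (for C++) zero-width bit fields which older releases silently skipped now block that treatment in GCC 12, with a -Wpsabi note. A minimal sketch of the kinds of types involved, my own example rather than code from the patch (the typedef and struct names are made up):

typedef double v2df __attribute__ ((vector_size (16)));

struct wrapped  { v2df v; };            /* one field: passed like a bare v2df vector argument   */
struct padded   { v2df v; double d; };  /* two fields: falls back to the normal aggregate ABI   */
struct with_zbf { v2df v; int : 0; };   /* C++: zero-width bit field; no longer treated as a
                                           single value in GCC 12, -Wpsabi notes the change     */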
-(define_attr "cpu" "z900,z990,z9_109,z9_ec,z10,z196,zEC12,z13,z14,z15" +(define_attr "cpu" "z900,z990,z9_109,z9_ec,z10,z196,zEC12,z13,z14,z15,z16" (const (symbol_ref "s390_tune_attr"))) (define_attr "cpu_facility" - "standard,ieee,zarch,cpu_zarch,longdisp,extimm,dfp,z10,z196,zEC12,vx,z13,z14,vxe,z15,vxe2,arch14,nnpa" + "standard,ieee,zarch,cpu_zarch,longdisp,extimm,dfp,z10,z196,zEC12,vx,z13,z14,vxe,z15,vxe2,z16,nnpa" (const_string "standard")) (define_attr "enabled" "" @@ -588,8 +588,8 @@ (match_test "TARGET_VXE2")) (const_int 1) - (and (eq_attr "cpu_facility" "arch14") - (match_test "TARGET_ARCH14")) + (and (eq_attr "cpu_facility" "z16") + (match_test "TARGET_Z16")) (const_int 1) (and (eq_attr "cpu_facility" "nnpa") @@ -629,6 +629,9 @@ ;; Pipeline description for z15 (include "8561.md") +;; Pipeline description for z16 +(include "3931.md") + ;; Predicates (include "predicates.md") diff --git a/gcc/config/s390/s390.opt b/gcc/config/s390/s390.opt index 5068486..9e8d3bf 100644 --- a/gcc/config/s390/s390.opt +++ b/gcc/config/s390/s390.opt @@ -116,7 +116,10 @@ EnumValue Enum(processor_type) String(arch13) Value(PROCESSOR_8561_Z15) EnumValue -Enum(processor_type) String(arch14) Value(PROCESSOR_ARCH14) +Enum(processor_type) String(arch14) Value(PROCESSOR_3931_Z16) + +EnumValue +Enum(processor_type) String(z16) Value(PROCESSOR_3931_Z16) EnumValue Enum(processor_type) String(native) Value(PROCESSOR_NATIVE) DriverOnly diff --git a/gcc/config/sh/sh.cc b/gcc/config/sh/sh.cc index 8d40563..03e1c04 100644 --- a/gcc/config/sh/sh.cc +++ b/gcc/config/sh/sh.cc @@ -10762,6 +10762,12 @@ sh_register_move_cost (machine_mode mode, && ! REGCLASS_HAS_GENERAL_REG (dstclass)) return 2 * ((GET_MODE_SIZE (mode) + 7) / 8U); + if (((dstclass == FP_REGS || dstclass == DF_REGS) + && (srcclass == PR_REGS)) + || ((srcclass == FP_REGS || srcclass == DF_REGS) + && (dstclass == PR_REGS))) + return 7; + return 2 * ((GET_MODE_SIZE (mode) + 3) / 4U); } diff --git a/gcc/config/sh/sh.opt b/gcc/config/sh/sh.opt index 8618e78..f494ab8 100644 --- a/gcc/config/sh/sh.opt +++ b/gcc/config/sh/sh.opt @@ -207,7 +207,7 @@ Target RejectNegative Mask(ALIGN_DOUBLE) Align doubles at 64-bit boundaries. mdiv= -Target RejectNegative Joined Var(sh_div_str) Init("") +Target Save RejectNegative Joined Var(sh_div_str) Init("") Division strategy, one of: call-div1, call-fp, call-table. mdivsi3_libfunc= diff --git a/gcc/config/sparc/sparc.cc b/gcc/config/sparc/sparc.cc index bb4ce88..c72c38e 100644 --- a/gcc/config/sparc/sparc.cc +++ b/gcc/config/sparc/sparc.cc @@ -712,7 +712,8 @@ static bool sparc_modes_tieable_p (machine_mode, machine_mode); static bool sparc_can_change_mode_class (machine_mode, machine_mode, reg_class_t); static HOST_WIDE_INT sparc_constant_alignment (const_tree, HOST_WIDE_INT); -static bool sparc_vectorize_vec_perm_const (machine_mode, rtx, rtx, rtx, +static bool sparc_vectorize_vec_perm_const (machine_mode, machine_mode, + rtx, rtx, rtx, const vec_perm_indices &); static bool sparc_can_follow_jump (const rtx_insn *, const rtx_insn *); static HARD_REG_SET sparc_zero_call_used_regs (HARD_REG_SET); @@ -8884,8 +8885,20 @@ epilogue_renumber (rtx *where, int test) if (REGNO (*where) >= 8 && REGNO (*where) < 24) /* oX or lX */ return 1; if (! 
test && REGNO (*where) >= 24 && REGNO (*where) < 32) - *where = gen_rtx_REG (GET_MODE (*where), OUTGOING_REGNO (REGNO(*where))); - /* fallthrough */ + { + if (ORIGINAL_REGNO (*where)) + { + rtx n = gen_raw_REG (GET_MODE (*where), + OUTGOING_REGNO (REGNO (*where))); + ORIGINAL_REGNO (n) = ORIGINAL_REGNO (*where); + *where = n; + } + else + *where = gen_rtx_REG (GET_MODE (*where), + OUTGOING_REGNO (REGNO (*where))); + } + return 0; + case SCRATCH: case PC: case CONST_INT: @@ -13023,15 +13036,19 @@ sparc_expand_vec_perm_bmask (machine_mode vmode, rtx sel) /* Implement TARGET_VEC_PERM_CONST. */ static bool -sparc_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) +sparc_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, + rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) { + if (vmode != op_mode) + return false; + if (!TARGET_VIS2) return false; - /* All permutes are supported. */ + /* All 8-byte permutes are supported. */ if (!target) - return true; + return GET_MODE_SIZE (vmode) == 8; /* Force target-independent code to convert constant permutations on other modes down to V8QI. Rely on this to avoid the complexity of the byte diff --git a/gcc/config/tilepro/gen-mul-tables.cc b/gcc/config/tilepro/gen-mul-tables.cc index c9649fb..5218398 100644 --- a/gcc/config/tilepro/gen-mul-tables.cc +++ b/gcc/config/tilepro/gen-mul-tables.cc @@ -90,6 +90,8 @@ typedef long long MUL_TYPE; #define MIN(x, y) ((x) <= (y) ? (x) : (y)) #define MAX(x, y) ((x) >= (y) ? (x) : (y)) +#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0])) + /* For this program a unary op is one which has only one nonconstant operand. So shift left by 5 is considered unary. */ typedef MUL_TYPE (*unary_op_func) (MUL_TYPE); @@ -462,7 +464,7 @@ find_sequences (ExpressionTree &s, ExpressionTreeMap &best_solution) const Operator *const prev_op = s.m_exprs[num_vals - 1].m_op; const int prev_top_index = (prev_op != NULL) ? prev_op->m_top_index : -1; - for (size_t f = 0; f < sizeof ops / sizeof ops[0]; f++) + for (size_t f = 0; f < ARRAY_SIZE (ops); f++) { const Operator *const op = &ops[f]; @@ -564,7 +566,7 @@ create_insn_code_compression_table () printf ("const enum insn_code %s_multiply_insn_seq_decode_opcode[] = {\n" " CODE_FOR_nothing /* must be first */ ", ARCH); - for (size_t i = 0; i < sizeof ops / sizeof ops[0]; i++) + for (size_t i = 0; i < ARRAY_SIZE (ops); i++) { Operator *op = &ops[i]; int index = -1; diff --git a/gcc/config/v850/v850-c.cc b/gcc/config/v850/v850-c.cc index 45a3a0d..09ec540 100644 --- a/gcc/config/v850/v850-c.cc +++ b/gcc/config/v850/v850-c.cc @@ -64,7 +64,7 @@ static int pop_data_area (v850_data_area data_area) { if (data_area_stack == NULL) - warning (OPT_Wpragmas, "%<#pragma%> GHS endXXXX found without " + warning (OPT_Wpragmas, "%<#pragma%> GHS endXXX found without " "previous startXXX"); else if (data_area != data_area_stack->data_area) warning (OPT_Wpragmas, "%<#pragma%> GHS endXXX does not match " diff --git a/gcc/config/vms/vms.cc b/gcc/config/vms/vms.cc index 5d565e3..d0d44ba 100644 --- a/gcc/config/vms/vms.cc +++ b/gcc/config/vms/vms.cc @@ -99,7 +99,7 @@ static const struct vms_crtl_name vms_crtl_names[] = /* Number of entires in the above array. */ -#define NBR_CRTL_NAMES (sizeof (vms_crtl_names) / sizeof (*vms_crtl_names)) +#define NBR_CRTL_NAMES (ARRAY_SIZE (vms_crtl_names)) /* List of aliased identifiers. They must be persistent across gc. 
*/ diff --git a/gcc/config/xtensa/constraints.md b/gcc/config/xtensa/constraints.md index 7fda33a..e7ac8db 100644 --- a/gcc/config/xtensa/constraints.md +++ b/gcc/config/xtensa/constraints.md @@ -92,7 +92,7 @@ "An integer constant in the range @minus{}32-95 for use with MOVI.N instructions." (and (match_code "const_int") - (match_test "ival >= -32 && ival <= 95"))) + (match_test "IN_RANGE (ival, -32, 95)"))) (define_constraint "N" "An unsigned 8-bit integer constant shifted left by 8 bits for use @@ -103,7 +103,7 @@ (define_constraint "O" "An integer constant that can be used in ADDI.N instructions." (and (match_code "const_int") - (match_test "ival == -1 || (ival >= 1 && ival <= 15)"))) + (match_test "ival == -1 || IN_RANGE (ival, 1, 15)"))) (define_constraint "P" "An integer constant that can be used as a mask value in an EXTUI diff --git a/gcc/config/xtensa/predicates.md b/gcc/config/xtensa/predicates.md index 01226a2..edd13ae 100644 --- a/gcc/config/xtensa/predicates.md +++ b/gcc/config/xtensa/predicates.md @@ -25,8 +25,7 @@ (define_predicate "addsubx_operand" (and (match_code "const_int") - (match_test "INTVAL (op) >= 1 - && INTVAL (op) <= 3"))) + (match_test "IN_RANGE (INTVAL (op), 1, 3)"))) (define_predicate "arith_operand" (ior (and (match_code "const_int") @@ -53,9 +52,19 @@ (match_test "xtensa_mask_immediate (INTVAL (op))")) (match_operand 0 "register_operand"))) +(define_predicate "shifted_mask_operand" + (match_code "const_int") +{ + HOST_WIDE_INT mask = INTVAL (op); + int shift = ctz_hwi (mask); + + return IN_RANGE (shift, 1, 31) + && xtensa_mask_immediate ((uint32_t)mask >> shift); +}) + (define_predicate "extui_fldsz_operand" (and (match_code "const_int") - (match_test "xtensa_mask_immediate ((1 << INTVAL (op)) - 1)"))) + (match_test "IN_RANGE (INTVAL (op), 1, 16)"))) (define_predicate "sext_operand" (if_then_else (match_test "TARGET_SEXT") @@ -64,7 +73,7 @@ (define_predicate "sext_fldsz_operand" (and (match_code "const_int") - (match_test "INTVAL (op) >= 8 && INTVAL (op) <= 23"))) + (match_test "IN_RANGE (INTVAL (op), 8, 23)"))) (define_predicate "lsbitnum_operand" (and (match_code "const_int") @@ -156,6 +165,19 @@ (and (match_code "const_int") (match_test "xtensa_mem_offset (INTVAL (op), SFmode)"))) +(define_predicate "reload_operand" + (match_code "mem") +{ + const_rtx addr = XEXP (op, 0); + if (REG_P (addr)) + return REGNO (addr) == A1_REG; + if (GET_CODE (addr) == PLUS) + return REG_P (XEXP (addr, 0)) + && REGNO (XEXP (addr, 0)) == A1_REG + && CONST_INT_P (XEXP (addr, 1)); + return false; +}) + (define_predicate "branch_operator" (match_code "eq,ne,lt,ge")) @@ -165,9 +187,15 @@ (define_predicate "boolean_operator" (match_code "eq,ne")) +(define_predicate "logical_shift_operator" + (match_code "ashift,lshiftrt")) + (define_predicate "xtensa_cstoresi_operator" (match_code "eq,ne,gt,ge,lt,le")) +(define_predicate "xtensa_shift_per_byte_operator" + (match_code "ashift,ashiftrt,lshiftrt")) + (define_predicate "tls_symbol_operand" (and (match_code "symbol_ref") (match_test "SYMBOL_REF_TLS_MODEL (op) != 0"))) diff --git a/gcc/config/xtensa/xtensa-protos.h b/gcc/config/xtensa/xtensa-protos.h index 4bc42da..e020a33 100644 --- a/gcc/config/xtensa/xtensa-protos.h +++ b/gcc/config/xtensa/xtensa-protos.h @@ -41,18 +41,23 @@ extern void xtensa_expand_conditional_branch (rtx *, machine_mode); extern int xtensa_expand_conditional_move (rtx *, int); extern int xtensa_expand_scc (rtx *, machine_mode); extern int xtensa_expand_block_move (rtx *); +extern int 
xtensa_expand_block_set_unrolled_loop (rtx *); +extern int xtensa_expand_block_set_small_loop (rtx *); extern void xtensa_split_operand_pair (rtx *, machine_mode); +extern int xtensa_constantsynth (rtx, HOST_WIDE_INT); extern int xtensa_emit_move_sequence (rtx *, machine_mode); extern rtx xtensa_copy_incoming_a7 (rtx); extern void xtensa_expand_nonlocal_goto (rtx *); extern void xtensa_expand_compare_and_swap (rtx, rtx, rtx, rtx); extern void xtensa_expand_atomic (enum rtx_code, rtx, rtx, rtx, bool); extern void xtensa_emit_loop_end (rtx_insn *, rtx *); -extern char *xtensa_emit_branch (bool, bool, rtx *); -extern char *xtensa_emit_bit_branch (bool, bool, rtx *); +extern char *xtensa_emit_branch (bool, rtx *); extern char *xtensa_emit_movcc (bool, bool, bool, rtx *); +extern void xtensa_prepare_expand_call (int, rtx *); extern char *xtensa_emit_call (int, rtx *); +extern char *xtensa_emit_sibcall (int, rtx *); extern bool xtensa_tls_referenced_p (rtx); +extern enum rtx_code xtensa_shlrd_which_direction (rtx, rtx); #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, int); @@ -70,7 +75,7 @@ extern int xtensa_dbx_register_number (int); extern long compute_frame_size (poly_int64); extern bool xtensa_use_return_instruction_p (void); extern void xtensa_expand_prologue (void); -extern void xtensa_expand_epilogue (void); +extern void xtensa_expand_epilogue (bool); extern void order_regs_for_local_alloc (void); extern enum reg_class xtensa_regno_to_class (int regno); extern HOST_WIDE_INT xtensa_initial_elimination_offset (int from, int to); diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index 7023bad..d6f08b1 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -55,6 +55,7 @@ along with GCC; see the file COPYING3. If not see #include "dumpfile.h" #include "hw-doloop.h" #include "rtl-iter.h" +#include "insn-attr.h" /* This file should be included last. 
*/ #include "target-def.h" @@ -117,7 +118,7 @@ const char xtensa_leaf_regs[FIRST_PSEUDO_REGISTER] = static void xtensa_option_override (void); static enum internal_test map_test_to_internal_test (enum rtx_code); -static rtx gen_int_relational (enum rtx_code, rtx, rtx, int *); +static rtx gen_int_relational (enum rtx_code, rtx, rtx); static rtx gen_float_relational (enum rtx_code, rtx, rtx); static rtx gen_conditional_move (enum rtx_code, machine_mode, rtx, rtx); static rtx fixup_subreg_mem (rtx); @@ -134,6 +135,7 @@ static unsigned int xtensa_multibss_section_type_flags (tree, const char *, static section *xtensa_select_rtx_section (machine_mode, rtx, unsigned HOST_WIDE_INT); static bool xtensa_rtx_costs (rtx, machine_mode, int, int, int *, bool); +static int xtensa_insn_cost (rtx_insn *, bool); static int xtensa_register_move_cost (machine_mode, reg_class_t, reg_class_t); static int xtensa_memory_move_cost (machine_mode, reg_class_t, bool); @@ -187,7 +189,7 @@ static bool xtensa_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to); static HOST_WIDE_INT xtensa_starting_frame_offset (void); static unsigned HOST_WIDE_INT xtensa_asan_shadow_offset (void); - +static bool xtensa_function_ok_for_sibcall (tree, tree); static rtx xtensa_delegitimize_address (rtx); @@ -212,6 +214,8 @@ static rtx xtensa_delegitimize_address (rtx); #define TARGET_MEMORY_MOVE_COST xtensa_memory_move_cost #undef TARGET_RTX_COSTS #define TARGET_RTX_COSTS xtensa_rtx_costs +#undef TARGET_INSN_COST +#define TARGET_INSN_COST xtensa_insn_cost #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST hook_int_rtx_mode_as_bool_0 @@ -343,6 +347,9 @@ static rtx xtensa_delegitimize_address (rtx); #undef TARGET_DELEGITIMIZE_ADDRESS #define TARGET_DELEGITIMIZE_ADDRESS xtensa_delegitimize_address +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL xtensa_function_ok_for_sibcall + struct gcc_target targetm = TARGET_INITIALIZER; @@ -351,42 +358,42 @@ struct gcc_target targetm = TARGET_INITIALIZER; bool xtensa_simm8 (HOST_WIDE_INT v) { - return v >= -128 && v <= 127; + return IN_RANGE (v, -128, 127); } bool xtensa_simm8x256 (HOST_WIDE_INT v) { - return (v & 255) == 0 && (v >= -32768 && v <= 32512); + return (v & 255) == 0 && IN_RANGE (v, -32768, 32512); } bool xtensa_simm12b (HOST_WIDE_INT v) { - return v >= -2048 && v <= 2047; + return IN_RANGE (v, -2048, 2047); } static bool xtensa_uimm8 (HOST_WIDE_INT v) { - return v >= 0 && v <= 255; + return IN_RANGE (v, 0, 255); } static bool xtensa_uimm8x2 (HOST_WIDE_INT v) { - return (v & 1) == 0 && (v >= 0 && v <= 510); + return (v & 1) == 0 && IN_RANGE (v, 0, 510); } static bool xtensa_uimm8x4 (HOST_WIDE_INT v) { - return (v & 3) == 0 && (v >= 0 && v <= 1020); + return (v & 3) == 0 && IN_RANGE (v, 0, 1020); } @@ -456,19 +463,7 @@ xtensa_b4constu (HOST_WIDE_INT v) bool xtensa_mask_immediate (HOST_WIDE_INT v) { -#define MAX_MASK_SIZE 16 - int mask_size; - - for (mask_size = 1; mask_size <= MAX_MASK_SIZE; mask_size++) - { - if ((v & 1) == 0) - return false; - v = v >> 1; - if (v == 0) - return true; - } - - return false; + return IN_RANGE (exact_log2 (v + 1), 1, 16); } @@ -549,7 +544,7 @@ smalloffset_mem_p (rtx op) return FALSE; val = INTVAL (offset); - return (val & 3) == 0 && (val >= 0 && val <= 60); + return (val & 3) == 0 && IN_RANGE (val, 0, 60); } } return FALSE; @@ -688,8 +683,7 @@ map_test_to_internal_test (enum rtx_code test_code) static rtx gen_int_relational (enum rtx_code test_code, /* relational test (EQ, etc) */ rtx cmp0, /* first operand to compare 
*/ - rtx cmp1, /* second operand to compare */ - int *p_invert /* whether branch needs to reverse test */) + rtx cmp1 /* second operand to compare */) { struct cmp_info { @@ -721,6 +715,7 @@ gen_int_relational (enum rtx_code test_code, /* relational test (EQ, etc) */ enum internal_test test; machine_mode mode; struct cmp_info *p_info; + int invert; test = map_test_to_internal_test (test_code); gcc_assert (test != ITEST_MAX); @@ -757,9 +752,9 @@ gen_int_relational (enum rtx_code test_code, /* relational test (EQ, etc) */ } /* See if we need to invert the result. */ - *p_invert = ((GET_CODE (cmp1) == CONST_INT) - ? p_info->invert_const - : p_info->invert_reg); + invert = ((GET_CODE (cmp1) == CONST_INT) + ? p_info->invert_const + : p_info->invert_reg); /* Comparison to constants, may involve adding 1 to change a LT into LE. Comparison between two registers, may involve switching operands. */ @@ -776,7 +771,9 @@ gen_int_relational (enum rtx_code test_code, /* relational test (EQ, etc) */ cmp1 = temp; } - return gen_rtx_fmt_ee (p_info->test_code, VOIDmode, cmp0, cmp1); + return gen_rtx_fmt_ee (invert ? reverse_condition (p_info->test_code) + : p_info->test_code, + VOIDmode, cmp0, cmp1); } @@ -835,45 +832,33 @@ xtensa_expand_conditional_branch (rtx *operands, machine_mode mode) enum rtx_code test_code = GET_CODE (operands[0]); rtx cmp0 = operands[1]; rtx cmp1 = operands[2]; - rtx cmp; - int invert; - rtx label1, label2; + rtx cmp, label; switch (mode) { + case E_SFmode: + if (TARGET_HARD_FLOAT) + { + cmp = gen_float_relational (test_code, cmp0, cmp1); + break; + } + /* FALLTHRU */ + case E_DFmode: default: fatal_insn ("bad test", gen_rtx_fmt_ee (test_code, VOIDmode, cmp0, cmp1)); case E_SImode: - invert = FALSE; - cmp = gen_int_relational (test_code, cmp0, cmp1, &invert); - break; - - case E_SFmode: - if (!TARGET_HARD_FLOAT) - fatal_insn ("bad test", gen_rtx_fmt_ee (test_code, VOIDmode, - cmp0, cmp1)); - invert = FALSE; - cmp = gen_float_relational (test_code, cmp0, cmp1); + cmp = gen_int_relational (test_code, cmp0, cmp1); break; } /* Generate the branch. */ - - label1 = gen_rtx_LABEL_REF (VOIDmode, operands[3]); - label2 = pc_rtx; - - if (invert) - { - label2 = label1; - label1 = pc_rtx; - } - + label = gen_rtx_LABEL_REF (VOIDmode, operands[3]); emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_IF_THEN_ELSE (VOIDmode, cmp, - label1, - label2))); + label, + pc_rtx))); } @@ -1045,6 +1030,123 @@ xtensa_split_operand_pair (rtx operands[4], machine_mode mode) } +/* Try to emit insns to load srcval (that cannot fit into signed 12-bit) + into dst with synthesizing a such constant value from a sequence of + load-immediate / arithmetic ones, instead of a L32R instruction + (plus a constant in litpool). 
*/ + +static void +xtensa_emit_constantsynth (rtx dst, enum rtx_code code, + HOST_WIDE_INT imm0, HOST_WIDE_INT imm1, + rtx (*gen_op)(rtx, HOST_WIDE_INT), + HOST_WIDE_INT imm2) +{ + gcc_assert (REG_P (dst)); + emit_move_insn (dst, GEN_INT (imm0)); + emit_move_insn (dst, gen_rtx_fmt_ee (code, SImode, + dst, GEN_INT (imm1))); + if (gen_op) + emit_move_insn (dst, gen_op (dst, imm2)); +} + +static int +xtensa_constantsynth_2insn (rtx dst, HOST_WIDE_INT srcval, + rtx (*gen_op)(rtx, HOST_WIDE_INT), + HOST_WIDE_INT op_imm) +{ + int shift = exact_log2 (srcval + 1); + + if (IN_RANGE (shift, 1, 31)) + { + xtensa_emit_constantsynth (dst, LSHIFTRT, -1, 32 - shift, + gen_op, op_imm); + return 1; + } + + if (IN_RANGE (srcval, (-2048 - 32768), (2047 + 32512))) + { + HOST_WIDE_INT imm0, imm1; + + if (srcval < -32768) + imm1 = -32768; + else if (srcval > 32512) + imm1 = 32512; + else + imm1 = srcval & ~255; + imm0 = srcval - imm1; + if (TARGET_DENSITY && imm1 < 32512 && IN_RANGE (imm0, 224, 255)) + imm0 -= 256, imm1 += 256; + xtensa_emit_constantsynth (dst, PLUS, imm0, imm1, gen_op, op_imm); + return 1; + } + + shift = ctz_hwi (srcval); + if (xtensa_simm12b (srcval >> shift)) + { + xtensa_emit_constantsynth (dst, ASHIFT, srcval >> shift, shift, + gen_op, op_imm); + return 1; + } + + return 0; +} + +static rtx +xtensa_constantsynth_rtx_SLLI (rtx reg, HOST_WIDE_INT imm) +{ + return gen_rtx_ASHIFT (SImode, reg, GEN_INT (imm)); +} + +static rtx +xtensa_constantsynth_rtx_ADDSUBX (rtx reg, HOST_WIDE_INT imm) +{ + return imm == 7 + ? gen_rtx_MINUS (SImode, gen_rtx_ASHIFT (SImode, reg, GEN_INT (3)), + reg) + : gen_rtx_PLUS (SImode, gen_rtx_ASHIFT (SImode, reg, + GEN_INT (floor_log2 (imm - 1))), + reg); +} + +int +xtensa_constantsynth (rtx dst, HOST_WIDE_INT srcval) +{ + /* No need for synthesizing for what fits into MOVI instruction. */ + if (xtensa_simm12b (srcval)) + return 0; + + /* 2-insns substitution. */ + if ((optimize_size || (optimize && xtensa_extra_l32r_costs >= 1)) + && xtensa_constantsynth_2insn (dst, srcval, NULL, 0)) + return 1; + + /* 3-insns substitution. */ + if (optimize > 1 && !optimize_size && xtensa_extra_l32r_costs >= 2) + { + int shift, divisor; + + /* 2-insns substitution followed by SLLI. */ + shift = ctz_hwi (srcval); + if (IN_RANGE (shift, 1, 31) && + xtensa_constantsynth_2insn (dst, srcval >> shift, + xtensa_constantsynth_rtx_SLLI, + shift)) + return 1; + + /* 2-insns substitution followed by ADDX[248] or SUBX8. */ + if (TARGET_ADDX) + for (divisor = 3; divisor <= 9; divisor += 2) + if (srcval % divisor == 0 && + xtensa_constantsynth_2insn (dst, srcval / divisor, + xtensa_constantsynth_rtx_ADDSUBX, + divisor)) + return 1; + } + + return 0; +} + + /* Emit insns to move operands[1] into operands[0]. Return 1 if we have written out everything that needs to be done to do the move. Otherwise, return 0 and the caller will emit the move @@ -1082,22 +1184,6 @@ xtensa_emit_move_sequence (rtx *operands, machine_mode mode) if (! TARGET_AUTO_LITPOOLS && ! TARGET_CONST16) { - /* Try to emit MOVI + SLLI sequence, that is smaller - than L32R + literal. 
*/ - if (optimize_size && mode == SImode && CONST_INT_P (src) - && register_operand (dst, mode)) - { - HOST_WIDE_INT srcval = INTVAL (src); - int shift = ctz_hwi (srcval); - - if (xtensa_simm12b (srcval >> shift)) - { - emit_move_insn (dst, GEN_INT (srcval >> shift)); - emit_insn (gen_ashlsi3_internal (dst, dst, GEN_INT (shift))); - return 1; - } - } - src = force_const_mem (SImode, src); operands[1] = src; } @@ -1325,7 +1411,7 @@ xtensa_expand_block_move (rtx *operands) move_ratio = 4; if (optimize > 2) move_ratio = LARGEST_MOVE_RATIO; - num_pieces = (bytes / align) + (bytes % align); /* Close enough anyway. */ + num_pieces = (bytes / align) + ((bytes % align + 1) / 2); if (num_pieces > move_ratio) return 0; @@ -1362,7 +1448,7 @@ xtensa_expand_block_move (rtx *operands) temp[next] = gen_reg_rtx (mode[next]); x = adjust_address (src_mem, mode[next], offset_ld); - emit_insn (gen_rtx_SET (temp[next], x)); + emit_move_insn (temp[next], x); offset_ld += next_amount; bytes -= next_amount; @@ -1372,9 +1458,9 @@ xtensa_expand_block_move (rtx *operands) if (active[phase]) { active[phase] = false; - + x = adjust_address (dst_mem, mode[phase], offset_st); - emit_insn (gen_rtx_SET (x, temp[phase])); + emit_move_insn (x, temp[phase]); offset_st += amount[phase]; } @@ -1385,6 +1471,246 @@ xtensa_expand_block_move (rtx *operands) } +/* Try to expand a block set operation to a sequence of RTL move + instructions. If not optimizing, or if the block size is not a + constant, or if the block is too large, or if the value to + initialize the block with is not a constant, the expansion + fails and GCC falls back to calling memset(). + + operands[0] is the destination + operands[1] is the length + operands[2] is the initialization value + operands[3] is the alignment */ + +static int +xtensa_sizeof_MOVI (HOST_WIDE_INT imm) +{ + return (TARGET_DENSITY && IN_RANGE (imm, -32, 95)) ? 2 : 3; +} + +int +xtensa_expand_block_set_unrolled_loop (rtx *operands) +{ + rtx dst_mem = operands[0]; + HOST_WIDE_INT bytes, value, align; + int expand_len, funccall_len; + rtx x, reg; + int offset; + + if (!CONST_INT_P (operands[1]) || !CONST_INT_P (operands[2])) + return 0; + + bytes = INTVAL (operands[1]); + if (bytes <= 0) + return 0; + value = (int8_t)INTVAL (operands[2]); + align = INTVAL (operands[3]); + if (align > MOVE_MAX) + align = MOVE_MAX; + + /* Insn expansion: holding the init value. + Either MOV(.N) or L32R w/litpool. */ + if (align == 1) + expand_len = xtensa_sizeof_MOVI (value); + else if (value == 0 || value == -1) + expand_len = TARGET_DENSITY ? 2 : 3; + else + expand_len = 3 + 4; + /* Insn expansion: a series of aligned memory stores. + Consist of S8I, S16I or S32I(.N). */ + expand_len += (bytes / align) * (TARGET_DENSITY + && align == 4 ? 2 : 3); + /* Insn expansion: the remainder, sub-aligned memory stores. + A combination of S8I and S16I as needed. */ + expand_len += ((bytes % align + 1) / 2) * 3; + + /* Function call: preparing two arguments. */ + funccall_len = xtensa_sizeof_MOVI (value); + funccall_len += xtensa_sizeof_MOVI (bytes); + /* Function call: calling memset(). */ + funccall_len += TARGET_LONGCALLS ? (3 + 4 + 3) : 3; + + /* Apply expansion bonus (2x) if optimizing for speed. */ + if (optimize > 1 && !optimize_size) + funccall_len *= 2; + + /* Decide whether to expand or not, based on the sum of the length + of instructions. 
*/ + if (expand_len > funccall_len) + return 0; + + x = XEXP (dst_mem, 0); + if (!REG_P (x)) + dst_mem = replace_equiv_address (dst_mem, force_reg (Pmode, x)); + switch (align) + { + case 1: + break; + case 2: + value = (int16_t)((uint8_t)value * 0x0101U); + break; + case 4: + value = (int32_t)((uint8_t)value * 0x01010101U); + break; + default: + gcc_unreachable (); + } + reg = force_reg (SImode, GEN_INT (value)); + + offset = 0; + do + { + int unit_size = MIN (bytes, align); + machine_mode unit_mode = (unit_size >= 4 ? SImode : + (unit_size >= 2 ? HImode : + QImode)); + unit_size = GET_MODE_SIZE (unit_mode); + + emit_move_insn (adjust_address (dst_mem, unit_mode, offset), + unit_mode == SImode ? reg + : convert_to_mode (unit_mode, reg, true)); + + offset += unit_size; + bytes -= unit_size; + } + while (bytes > 0); + + return 1; +} + +int +xtensa_expand_block_set_small_loop (rtx *operands) +{ + HOST_WIDE_INT bytes, value, align, count; + int expand_len, funccall_len; + rtx x, dst, end, reg; + machine_mode unit_mode; + rtx_code_label *label; + + if (!CONST_INT_P (operands[1]) || !CONST_INT_P (operands[2])) + return 0; + + bytes = INTVAL (operands[1]); + if (bytes <= 0) + return 0; + value = (int8_t)INTVAL (operands[2]); + align = INTVAL (operands[3]); + if (align > MOVE_MAX) + align = MOVE_MAX; + + /* Totally-aligned block only. */ + if (bytes % align != 0) + return 0; + count = bytes / align; + + /* If the Loop Option (zero-overhead looping) is configured and active, + almost no restrictions about the length of the block. */ + if (! (TARGET_LOOPS && optimize)) + { + /* If 4-byte aligned, small loop substitution is almost optimal, + thus limited to only offset to the end address for ADDI/ADDMI + instruction. */ + if (align == 4 + && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0))) + return 0; + + /* If no 4-byte aligned, loop count should be treated as the + constraint. */ + if (align != 4 + && count > ((optimize > 1 && !optimize_size) ? 8 : 15)) + return 0; + } + + /* Insn expansion: holding the init value. + Either MOV(.N) or L32R w/litpool. */ + if (align == 1) + expand_len = xtensa_sizeof_MOVI (value); + else if (value == 0 || value == -1) + expand_len = TARGET_DENSITY ? 2 : 3; + else + expand_len = 3 + 4; + if (TARGET_LOOPS && optimize) /* zero-overhead looping */ + { + /* Insn translation: Either MOV(.N) or L32R w/litpool for the + loop count. */ + expand_len += xtensa_simm12b (count) ? xtensa_sizeof_MOVI (count) + : 3 + 4; + /* Insn translation: LOOP, the zero-overhead looping setup + instruction. */ + expand_len += 3; + /* Insn expansion: the loop body instructions. + For store, one of S8I, S16I or S32I(.N). + For advance, ADDI(.N). */ + expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3) + + (TARGET_DENSITY ? 2 : 3); + } + else /* NO zero-overhead looping */ + { + /* Insn expansion: Either ADDI(.N) or ADDMI for the end address. */ + expand_len += bytes > 127 ? 3 + : (TARGET_DENSITY && bytes <= 15) ? 2 : 3; + /* Insn expansion: the loop body and branch instruction. + For store, one of S8I, S16I or S32I(.N). + For advance, ADDI(.N). + For branch, BNE. */ + expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3) + + (TARGET_DENSITY ? 2 : 3) + 3; + } + + /* Function call: preparing two arguments. */ + funccall_len = xtensa_sizeof_MOVI (value); + funccall_len += xtensa_sizeof_MOVI (bytes); + /* Function call: calling memset(). */ + funccall_len += TARGET_LONGCALLS ? (3 + 4 + 3) : 3; + + /* Apply expansion bonus (2x) if optimizing for speed. 
*/ + if (optimize > 1 && !optimize_size) + funccall_len *= 2; + + /* Decide whether to expand or not, based on the sum of the length + of instructions. */ + if (expand_len > funccall_len) + return 0; + + x = XEXP (operands[0], 0); + if (!REG_P (x)) + x = XEXP (replace_equiv_address (operands[0], force_reg (Pmode, x)), 0); + dst = gen_reg_rtx (SImode); + emit_move_insn (dst, x); + end = gen_reg_rtx (SImode); + if (TARGET_LOOPS && optimize) + x = force_reg (SImode, operands[1] /* the length */); + else + x = operands[1]; + emit_insn (gen_addsi3 (end, dst, x)); + switch (align) + { + case 1: + unit_mode = QImode; + break; + case 2: + value = (int16_t)((uint8_t)value * 0x0101U); + unit_mode = HImode; + break; + case 4: + value = (int32_t)((uint8_t)value * 0x01010101U); + unit_mode = SImode; + break; + default: + gcc_unreachable (); + } + reg = force_reg (unit_mode, GEN_INT (value)); + + label = gen_label_rtx (); + emit_label (label); + emit_move_insn (gen_rtx_MEM (unit_mode, dst), reg); + emit_insn (gen_addsi3 (dst, dst, GEN_INT (align))); + emit_cmp_and_jump_insns (dst, end, NE, const0_rtx, SImode, true, label); + + return 1; +} + + void xtensa_expand_nonlocal_goto (rtx *operands) { @@ -1735,21 +2061,20 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operands) char * -xtensa_emit_branch (bool inverted, bool immed, rtx *operands) +xtensa_emit_branch (bool immed, rtx *operands) { static char result[64]; - enum rtx_code code; + enum rtx_code code = GET_CODE (operands[3]); const char *op; - code = GET_CODE (operands[3]); switch (code) { - case EQ: op = inverted ? "ne" : "eq"; break; - case NE: op = inverted ? "eq" : "ne"; break; - case LT: op = inverted ? "ge" : "lt"; break; - case GE: op = inverted ? "lt" : "ge"; break; - case LTU: op = inverted ? "geu" : "ltu"; break; - case GEU: op = inverted ? "ltu" : "geu"; break; + case EQ: op = "eq"; break; + case NE: op = "ne"; break; + case LT: op = "lt"; break; + case GE: op = "ge"; break; + case LTU: op = "ltu"; break; + case GEU: op = "geu"; break; default: gcc_unreachable (); } @@ -1769,32 +2094,6 @@ xtensa_emit_branch (bool inverted, bool immed, rtx *operands) char * -xtensa_emit_bit_branch (bool inverted, bool immed, rtx *operands) -{ - static char result[64]; - const char *op; - - switch (GET_CODE (operands[3])) - { - case EQ: op = inverted ? "bs" : "bc"; break; - case NE: op = inverted ? "bc" : "bs"; break; - default: gcc_unreachable (); - } - - if (immed) - { - unsigned bitnum = INTVAL (operands[1]) & 0x1f; - operands[1] = GEN_INT (bitnum); - sprintf (result, "b%si\t%%0, %%d1, %%2", op); - } - else - sprintf (result, "b%s\t%%0, %%1, %%2", op); - - return result; -} - - -char * xtensa_emit_movcc (bool inverted, bool isfp, bool isbool, rtx *operands) { static char result[64]; @@ -1802,12 +2101,14 @@ xtensa_emit_movcc (bool inverted, bool isfp, bool isbool, rtx *operands) const char *op; code = GET_CODE (operands[4]); + if (inverted) + code = reverse_condition (code); if (isbool) { switch (code) { - case EQ: op = inverted ? "t" : "f"; break; - case NE: op = inverted ? "f" : "t"; break; + case EQ: op = "f"; break; + case NE: op = "t"; break; default: gcc_unreachable (); } } @@ -1815,10 +2116,10 @@ xtensa_emit_movcc (bool inverted, bool isfp, bool isbool, rtx *operands) { switch (code) { - case EQ: op = inverted ? "nez" : "eqz"; break; - case NE: op = inverted ? "eqz" : "nez"; break; - case LT: op = inverted ? "gez" : "ltz"; break; - case GE: op = inverted ? 
"ltz" : "gez"; break; + case EQ: op = "eqz"; break; + case NE: op = "nez"; break; + case LT: op = "ltz"; break; + case GE: op = "gez"; break; default: gcc_unreachable (); } } @@ -1829,6 +2130,20 @@ xtensa_emit_movcc (bool inverted, bool isfp, bool isbool, rtx *operands) } +void +xtensa_prepare_expand_call (int callop, rtx *operands) +{ + rtx addr = XEXP (operands[callop], 0); + + if (flag_pic && SYMBOL_REF_P (addr) + && (!SYMBOL_REF_LOCAL_P (addr) || SYMBOL_REF_EXTERNAL_P (addr))) + addr = gen_sym_PLT (addr); + + if (!call_insn_operand (addr, VOIDmode)) + XEXP (operands[callop], 0) = copy_to_mode_reg (Pmode, addr); +} + + char * xtensa_emit_call (int callop, rtx *operands) { @@ -1847,6 +2162,24 @@ xtensa_emit_call (int callop, rtx *operands) } +char * +xtensa_emit_sibcall (int callop, rtx *operands) +{ + static char result[64]; + rtx tgt = operands[callop]; + + if (GET_CODE (tgt) == CONST_INT) + sprintf (result, "j.l\t" HOST_WIDE_INT_PRINT_HEX ", a9", + INTVAL (tgt)); + else if (register_operand (tgt, VOIDmode)) + sprintf (result, "jx\t%%%d", callop); + else + sprintf (result, "j.l\t%%%d, a9", callop); + + return result; +} + + bool xtensa_legitimate_address_p (machine_mode mode, rtx addr, bool strict) { @@ -2070,6 +2403,20 @@ xtensa_tls_referenced_p (rtx x) } +/* Helper function for "*shlrd_..." patterns. */ + +enum rtx_code +xtensa_shlrd_which_direction (rtx op0, rtx op1) +{ + if (GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT) + return ASHIFT; /* shld */ + if (GET_CODE (op0) == LSHIFTRT && GET_CODE (op1) == ASHIFT) + return LSHIFTRT; /* shrd */ + + return UNKNOWN; +} + + /* Implement TARGET_CANNOT_FORCE_CONST_MEM. */ static bool @@ -2379,7 +2726,7 @@ static void printx (FILE *file, signed int val) { /* Print a hexadecimal value in a nice way. */ - if ((val > -0xa) && (val < 0xa)) + if (IN_RANGE (val, -9, 9)) fprintf (file, "%d", val); else if (val < 0) fprintf (file, "-0x%x", -val); @@ -2430,17 +2777,11 @@ print_operand (FILE *file, rtx x, int letter) case 'K': if (GET_CODE (x) == CONST_INT) { - int num_bits = 0; unsigned val = INTVAL (x); - while (val & 1) - { - num_bits += 1; - val = val >> 1; - } - if ((val != 0) || (num_bits == 0) || (num_bits > 16)) + if (!xtensa_mask_immediate (val)) fatal_insn ("invalid mask", x); - fprintf (file, "%d", num_bits); + fprintf (file, "%d", floor_log2 (val + 1)); } else output_operand_lossage ("invalid %%K value"); @@ -2715,7 +3056,7 @@ xtensa_call_save_reg(int regno) return crtl->profile || !crtl->is_leaf || crtl->calls_eh_return || df_regs_ever_live_p (regno); - if (crtl->calls_eh_return && regno >= 2 && regno < 4) + if (crtl->calls_eh_return && IN_RANGE (regno, 2, 3)) return true; return !call_used_or_fixed_reg_p (regno) && df_regs_ever_live_p (regno); @@ -2835,7 +3176,7 @@ xtensa_expand_prologue (void) int callee_save_size = cfun->machine->callee_save_size; /* -128 is a limit of single addi instruction. 
*/ - if (total_size > 0 && total_size <= 128) + if (IN_RANGE (total_size, 1, 128)) { insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-total_size))); @@ -2964,7 +3305,7 @@ xtensa_expand_prologue (void) } void -xtensa_expand_epilogue (void) +xtensa_expand_epilogue (bool sibcall_p) { if (!TARGET_WINDOWED_ABI) { @@ -2998,10 +3339,13 @@ xtensa_expand_epilogue (void) if (xtensa_call_save_reg(regno)) { rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset)); + rtx reg; offset -= UNITS_PER_WORD; - emit_move_insn (gen_rtx_REG (SImode, regno), + emit_move_insn (reg = gen_rtx_REG (SImode, regno), gen_frame_mem (SImode, x)); + if (regno == A0_REG && sibcall_p) + emit_use (reg); } } @@ -3036,7 +3380,8 @@ xtensa_expand_epilogue (void) EH_RETURN_STACKADJ_RTX)); } cfun->machine->epilogue_done = true; - emit_jump_insn (gen_return ()); + if (!sibcall_p) + emit_jump_insn (gen_return ()); } bool @@ -3715,7 +4060,7 @@ xtensa_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, static bool xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UNUSED, - int *total, bool speed ATTRIBUTE_UNUSED) + int *total, bool speed) { int code = GET_CODE (x); @@ -3803,9 +4148,14 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code, return true; case CLZ: + case CLRSB: *total = COSTS_N_INSNS (TARGET_NSA ? 1 : 50); return true; + case BSWAP: + *total = COSTS_N_INSNS (mode == HImode ? 3 : 5); + return true; + case NOT: *total = COSTS_N_INSNS (mode == DImode ? 3 : 2); return true; @@ -3829,13 +4179,16 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code, return true; case ABS: + case NEG: { if (mode == SFmode) *total = COSTS_N_INSNS (TARGET_HARD_FLOAT ? 1 : 50); else if (mode == DFmode) *total = COSTS_N_INSNS (50); - else + else if (mode == DImode) *total = COSTS_N_INSNS (4); + else + *total = COSTS_N_INSNS (1); return true; } @@ -3851,10 +4204,6 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code, return true; } - case NEG: - *total = COSTS_N_INSNS (mode == DImode ? 4 : 2); - return true; - case MULT: { if (mode == SFmode) @@ -3894,11 +4243,11 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code, case UMOD: { if (mode == DImode) - *total = COSTS_N_INSNS (50); + *total = COSTS_N_INSNS (speed ? 100 : 50); else if (TARGET_DIV32) *total = COSTS_N_INSNS (32); else - *total = COSTS_N_INSNS (50); + *total = COSTS_N_INSNS (speed ? 100 : 50); return true; } @@ -3931,6 +4280,98 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code, } } +static bool +xtensa_is_insn_L32R_p(const rtx_insn *insn) +{ + rtx x = PATTERN (insn); + + if (GET_CODE (x) == SET) + { + x = XEXP (x, 1); + if (GET_CODE (x) == MEM) + { + x = XEXP (x, 0); + return (GET_CODE (x) == SYMBOL_REF || CONST_INT_P (x)) + && CONSTANT_POOL_ADDRESS_P (x); + } + } + + return false; +} + +/* Compute a relative costs of RTL insns. This is necessary in order to + achieve better RTL insn splitting/combination result. */ + +static int +xtensa_insn_cost (rtx_insn *insn, bool speed) +{ + if (!(recog_memoized (insn) < 0)) + { + int len = get_attr_length (insn), n = (len + 2) / 3; + + if (len == 0) + return COSTS_N_INSNS (0); + + if (speed) /* For speed cost. */ + { + /* "L32R" may be particular slow (implementation-dependent). */ + if (xtensa_is_insn_L32R_p (insn)) + return COSTS_N_INSNS (1 + xtensa_extra_l32r_costs); + + /* Cost based on the pipeline model. 
*/ + switch (get_attr_type (insn)) + { + case TYPE_STORE: + case TYPE_MOVE: + case TYPE_ARITH: + case TYPE_MULTI: + case TYPE_NOP: + case TYPE_FSTORE: + return COSTS_N_INSNS (n); + + case TYPE_LOAD: + return COSTS_N_INSNS (n - 1 + 2); + + case TYPE_JUMP: + case TYPE_CALL: + return COSTS_N_INSNS (n - 1 + 3); + + case TYPE_FCONV: + case TYPE_FLOAD: + case TYPE_MUL16: + case TYPE_MUL32: + case TYPE_RSR: + return COSTS_N_INSNS (n * 2); + + case TYPE_FMADD: + return COSTS_N_INSNS (n * 4); + + case TYPE_DIV32: + return COSTS_N_INSNS (n * 16); + + default: + break; + } + } + else /* For size cost. */ + { + /* Cost based on the instruction length. */ + if (get_attr_type (insn) != TYPE_UNKNOWN) + { + /* "L32R" itself plus constant in litpool. */ + if (xtensa_is_insn_L32R_p (insn)) + return COSTS_N_INSNS (2) + 1; + + /* Consider ".n" short instructions. */ + return COSTS_N_INSNS (n) - (n * 3 - len); + } + } + } + + /* Fall back. */ + return pattern_cost (PATTERN (insn), speed); +} + /* Worker function for TARGET_RETURN_IN_MEMORY. */ static bool @@ -4467,6 +4908,17 @@ xtensa_asan_shadow_offset (void) return HOST_WIDE_INT_UC (0x10000000); } +/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */ +static bool +xtensa_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, tree exp ATTRIBUTE_UNUSED) +{ + /* Do not allow sibcalls when windowed registers ABI is in effect. */ + if (TARGET_WINDOWED_ABI) + return false; + + return true; +} + static rtx xtensa_delegitimize_address (rtx op) { diff --git a/gcc/config/xtensa/xtensa.h b/gcc/config/xtensa/xtensa.h index 00e2930..d027a77 100644 --- a/gcc/config/xtensa/xtensa.h +++ b/gcc/config/xtensa/xtensa.h @@ -75,6 +75,11 @@ along with GCC; see the file COPYING3. If not see #define HAVE_AS_TLS 0 #endif +/* Define this if the target has no hardware divide instructions. */ +#if !TARGET_DIV32 +#define TARGET_HAS_NO_HW_DIVIDE +#endif + /* Target CPU builtins. */ #define TARGET_CPU_CPP_BUILTINS() \ @@ -499,7 +504,7 @@ enum reg_class used for this purpose since all function arguments are pushed on the stack. */ #define FUNCTION_ARG_REGNO_P(N) \ - ((N) >= GP_OUTGOING_ARG_FIRST && (N) <= GP_OUTGOING_ARG_LAST) + IN_RANGE ((N), GP_OUTGOING_ARG_FIRST, GP_OUTGOING_ARG_LAST) /* Record the number of argument words seen so far, along with a flag to indicate whether these are incoming arguments. (FUNCTION_INCOMING_ARG diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md index 3b61e5d..ef6bbc4 100644 --- a/gcc/config/xtensa/xtensa.md +++ b/gcc/config/xtensa/xtensa.md @@ -25,6 +25,7 @@ (A7_REG 7) (A8_REG 8) (A9_REG 9) + (A10_REG 10) (UNSPEC_NOP 2) (UNSPEC_PLT 3) @@ -83,6 +84,13 @@ ;; the same template. (define_mode_iterator HQI [HI QI]) +;; This code iterator is for *shlrd and its variants. +(define_code_iterator ior_op [ior plus]) + +;; This mode iterator allows the DC and SC patterns to be defined from +;; the same template. +(define_mode_iterator DSC [DC SC]) + ;; Attributes. @@ -98,7 +106,10 @@ ;; Describe a user's asm statement. (define_asm_attributes - [(set_attr "type" "multi")]) + [(set_attr "type" "multi") + (set_attr "mode" "none") + (set_attr "length" "3")]) ;; Should be the maximum possible length + ;; of a single machine instruction. ;; Pipeline model. @@ -224,20 +235,42 @@ ;; Multiplication. 
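For the new TARGET_INSN_COST hook above, a rough worked example of the size branch may help. This is only an illustration, not code from the patch, and it assumes GCC's usual COSTS_N_INSNS (N) == 4 * N scaling:

/* Size cost as in xtensa_insn_cost: round the length up to 3-byte units,
   then credit back the bytes saved by narrow ".n" encodings.  */
static int
xtensa_size_cost_sketch (int len)
{
  int n = (len + 2) / 3;
  return 4 * n /* COSTS_N_INSNS (n) */ - (n * 3 - len);
}

/* xtensa_size_cost_sketch (2) == 3  -- e.g. a 2-byte MOV.N
   xtensa_size_cost_sketch (3) == 4  -- a full-size 3-byte instruction
   xtensa_size_cost_sketch (6) == 8  -- a 6-byte two-instruction pattern  */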
-(define_expand "<u>mulsidi3" +(define_expand "mulsidi3" [(set (match_operand:DI 0 "register_operand") - (mult:DI (any_extend:DI (match_operand:SI 1 "register_operand")) - (any_extend:DI (match_operand:SI 2 "register_operand"))))] + (mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand")) + (sign_extend:DI (match_operand:SI 2 "register_operand"))))] "TARGET_MUL32_HIGH" { rtx temp = gen_reg_rtx (SImode); emit_insn (gen_mulsi3 (temp, operands[1], operands[2])); - emit_insn (gen_<u>mulsi3_highpart (gen_highpart (SImode, operands[0]), - operands[1], operands[2])); + emit_insn (gen_mulsi3_highpart (gen_highpart (SImode, operands[0]), + operands[1], operands[2])); emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), temp)); DONE; }) +(define_expand "umulsidi3" + [(set (match_operand:DI 0 "register_operand") + (mult:DI (zero_extend:DI (match_operand:SI 1 "register_operand")) + (zero_extend:DI (match_operand:SI 2 "register_operand"))))] + "" +{ + if (TARGET_MUL32_HIGH) + { + rtx temp = gen_reg_rtx (SImode); + emit_insn (gen_mulsi3 (temp, operands[1], operands[2])); + emit_insn (gen_umulsi3_highpart (gen_highpart (SImode, operands[0]), + operands[1], operands[2])); + emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), temp)); + } + else + emit_library_call_value (gen_rtx_SYMBOL_REF (Pmode, "__umulsidi3"), + operands[0], LCT_NORMAL, DImode, + operands[1], SImode, + operands[2], SImode); + DONE; +}) + (define_insn "<u>mulsi3_highpart" [(set (match_operand:SI 0 "register_operand" "=a") (truncate:SI @@ -261,30 +294,16 @@ (set_attr "mode" "SI") (set_attr "length" "3")]) -(define_insn "mulhisi3" - [(set (match_operand:SI 0 "register_operand" "=C,A") - (mult:SI (sign_extend:SI - (match_operand:HI 1 "register_operand" "%r,r")) - (sign_extend:SI - (match_operand:HI 2 "register_operand" "r,r"))))] - "TARGET_MUL16 || TARGET_MAC16" - "@ - mul16s\t%0, %1, %2 - mul.aa.ll\t%1, %2" - [(set_attr "type" "mul16,mac16") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) - -(define_insn "umulhisi3" +(define_insn "<u>mulhisi3" [(set (match_operand:SI 0 "register_operand" "=C,A") - (mult:SI (zero_extend:SI + (mult:SI (any_extend:SI (match_operand:HI 1 "register_operand" "%r,r")) - (zero_extend:SI + (any_extend:SI (match_operand:HI 2 "register_operand" "r,r"))))] "TARGET_MUL16 || TARGET_MAC16" "@ - mul16u\t%0, %1, %2 - umul.aa.ll\t%1, %2" + mul16<su>\t%0, %1, %2 + <u>mul.aa.ll\t%1, %2" [(set_attr "type" "mul16,mac16") (set_attr "mode" "SI") (set_attr "length" "3,3")]) @@ -429,7 +448,17 @@ (set_attr "length" "3")]) -;; Count leading/trailing zeros and find first bit. +;; Count redundant leading sign bits and leading/trailing zeros, +;; and find first bit. + +(define_insn "clrsbsi2" + [(set (match_operand:SI 0 "register_operand" "=a") + (clrsb:SI (match_operand:SI 1 "register_operand" "r")))] + "TARGET_NSA" + "nsa\t%0, %1" + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "3")]) (define_insn "clzsi2" [(set (match_operand:SI 0 "register_operand" "=a") @@ -471,23 +500,78 @@ ;; Byte swap. 
-(define_insn "bswapsi2" - [(set (match_operand:SI 0 "register_operand" "=&a") - (bswap:SI (match_operand:SI 1 "register_operand" "r")))] - "!optimize_size" - "ssai\t8\;srli\t%0, %1, 16\;src\t%0, %0, %1\;src\t%0, %0, %0\;src\t%0, %1, %0" - [(set_attr "type" "arith") - (set_attr "mode" "SI") - (set_attr "length" "15")]) +(define_insn "bswaphi2" + [(set (match_operand:HI 0 "register_operand" "=a") + (bswap:HI (match_operand:HI 1 "register_operand" "r"))) + (clobber (match_scratch:HI 2 "=&a"))] + "" + "extui\t%2, %1, 8, 8\;slli\t%0, %1, 8\;or\t%0, %0, %2" + [(set_attr "type" "arith") + (set_attr "mode" "HI") + (set_attr "length" "9")]) -(define_insn "bswapdi2" - [(set (match_operand:DI 0 "register_operand" "=&a") - (bswap:DI (match_operand:DI 1 "register_operand" "r")))] - "!optimize_size" - "ssai\t8\;srli\t%0, %D1, 16\;src\t%0, %0, %D1\;src\t%0, %0, %0\;src\t%0, %D1, %0\;srli\t%D0, %1, 16\;src\t%D0, %D0, %1\;src\t%D0, %D0, %D0\;src\t%D0, %1, %D0" - [(set_attr "type" "arith") - (set_attr "mode" "DI") - (set_attr "length" "27")]) +(define_expand "bswapsi2" + [(set (match_operand:SI 0 "register_operand" "") + (bswap:SI (match_operand:SI 1 "register_operand" "")))] + "!optimize_debug && optimize > 1" +{ + /* GIMPLE manual byte-swapping recognition is now activated. + For both built-in and manual bswaps, emit corresponding library call + if optimizing for size, or a series of dedicated machine instructions + if otherwise. */ + if (optimize_size) + emit_library_call_value (optab_libfunc (bswap_optab, SImode), + operands[0], LCT_NORMAL, SImode, + operands[1], SImode); + else + emit_insn (gen_bswapsi2_internal (operands[0], operands[1])); + DONE; +}) + +(define_insn "bswapsi2_internal" + [(set (match_operand:SI 0 "register_operand" "=a,&a") + (bswap:SI (match_operand:SI 1 "register_operand" "0,r"))) + (clobber (match_scratch:SI 2 "=&a,X"))] + "!optimize_debug && optimize > 1 && !optimize_size" +{ + rtx_insn *prev_insn = prev_nonnote_nondebug_insn (insn); + const char *init = "ssai\t8\;"; + static char result[64]; + if (prev_insn && NONJUMP_INSN_P (prev_insn)) + { + rtx x = PATTERN (prev_insn); + if (GET_CODE (x) == PARALLEL && XVECLEN (x, 0) == 2 + && GET_CODE (XVECEXP (x, 0, 0)) == SET + && GET_CODE (XVECEXP (x, 0, 1)) == CLOBBER) + { + x = XEXP (XVECEXP (x, 0, 0), 1); + if (GET_CODE (x) == BSWAP && GET_MODE (x) == SImode) + init = ""; + } + } + sprintf (result, + (which_alternative == 0) + ? "%s" "srli\t%%2, %%1, 16\;src\t%%2, %%2, %%1\;src\t%%2, %%2, %%2\;src\t%%0, %%1, %%2" + : "%s" "srli\t%%0, %%1, 16\;src\t%%0, %%0, %%1\;src\t%%0, %%0, %%0\;src\t%%0, %%1, %%0", + init); + return result; +} + [(set_attr "type" "arith,arith") + (set_attr "mode" "SI") + (set_attr "length" "15,15")]) + +(define_expand "bswapdi2" + [(set (match_operand:DI 0 "register_operand" "") + (bswap:DI (match_operand:DI 1 "register_operand" "")))] + "!optimize_debug && optimize > 1 && optimize_size" +{ + /* Replace with a single DImode library call. + Without this, two SImode library calls are emitted. */ + emit_library_call_value (optab_libfunc (bswap_optab, DImode), + operands[0], LCT_NORMAL, DImode, + operands[1], DImode); + DONE; +}) ;; Negation and one's complement. 
@@ -501,16 +585,26 @@ (set_attr "mode" "SI") (set_attr "length" "3")]) -(define_expand "one_cmplsi2" - [(set (match_operand:SI 0 "register_operand" "") - (not:SI (match_operand:SI 1 "register_operand" "")))] +(define_insn_and_split "one_cmplsi2" + [(set (match_operand:SI 0 "register_operand" "=a") + (not:SI (match_operand:SI 1 "register_operand" "r")))] "" + "#" + "&& can_create_pseudo_p ()" + [(set (match_dup 2) + (const_int -1)) + (set (match_dup 0) + (xor:SI (match_dup 1) + (match_dup 2)))] { - rtx temp = gen_reg_rtx (SImode); - emit_insn (gen_movsi (temp, constm1_rtx)); - emit_insn (gen_xorsi3 (operands[0], temp, operands[1])); - DONE; -}) + operands[2] = gen_reg_rtx (SImode); +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY") + (const_int 5) + (const_int 6)))]) (define_insn "negsf2" [(set (match_operand:SF 0 "register_operand" "=f") @@ -536,6 +630,103 @@ (set_attr "mode" "SI") (set_attr "length" "3,3")]) +(define_insn_and_split "*andsi3_bitcmpl" + [(set (match_operand:SI 0 "register_operand" "=a") + (and:SI (not:SI (match_operand:SI 1 "register_operand" "r")) + (match_operand:SI 2 "register_operand" "r")))] + "" + "#" + "&& can_create_pseudo_p ()" + [(set (match_dup 3) + (and:SI (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (xor:SI (match_dup 3) + (match_dup 2)))] +{ + operands[3] = gen_reg_rtx (SImode); +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "6")]) + +(define_insn_and_split "*andsi3_const_pow2_minus_one" + [(set (match_operand:SI 0 "register_operand" "=a") + (and:SI (match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "const_int_operand" "i")))] + "IN_RANGE (exact_log2 (INTVAL (operands[2]) + 1), 17, 31)" + "#" + "&& 1" + [(set (match_dup 0) + (ashift:SI (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (lshiftrt:SI (match_dup 0) + (match_dup 2)))] +{ + operands[2] = GEN_INT (32 - floor_log2 (INTVAL (operands[2]) + 1)); +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY + && INTVAL (operands[2]) == 0x7FFFFFFF") + (const_int 5) + (const_int 6)))]) + +(define_insn_and_split "*andsi3_const_negative_pow2" + [(set (match_operand:SI 0 "register_operand" "=a") + (and:SI (match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "const_int_operand" "i")))] + "IN_RANGE (exact_log2 (-INTVAL (operands[2])), 12, 31)" + "#" + "&& 1" + [(set (match_dup 0) + (lshiftrt:SI (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (ashift:SI (match_dup 0) + (match_dup 2)))] +{ + operands[2] = GEN_INT (floor_log2 (-INTVAL (operands[2]))); +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "6")]) + +(define_insn_and_split "*andsi3_const_shifted_mask" + [(set (match_operand:SI 0 "register_operand" "=a") + (and:SI (match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "shifted_mask_operand" "i")))] + "! 
xtensa_simm12b (INTVAL (operands[2]))" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extract:SI (match_dup 1) + (match_dup 3) + (match_dup 4))) + (set (match_dup 0) + (ashift:SI (match_dup 0) + (match_dup 2)))] +{ + HOST_WIDE_INT mask = INTVAL (operands[2]); + int shift = ctz_hwi (mask); + int mask_size = floor_log2 (((uint32_t)mask >> shift) + 1); + int mask_pos = shift; + if (BITS_BIG_ENDIAN) + mask_pos = (32 - (mask_size + shift)) & 0x1f; + operands[2] = GEN_INT (shift); + operands[3] = GEN_INT (mask_size); + operands[4] = GEN_INT (mask_pos); +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY + && ctz_hwi (INTVAL (operands[2])) == 1") + (const_int 5) + (const_int 6)))]) + (define_insn "iorsi3" [(set (match_operand:SI 0 "register_operand" "=a") (ior:SI (match_operand:SI 1 "register_operand" "%r") @@ -631,7 +822,7 @@ ;; Field extract instructions. -(define_expand "extv" +(define_expand "extvsi" [(set (match_operand:SI 0 "register_operand" "") (sign_extract:SI (match_operand:SI 1 "register_operand" "") (match_operand:SI 2 "const_int_operand" "") @@ -646,12 +837,12 @@ if (!lsbitnum_operand (operands[3], SImode)) FAIL; - emit_insn (gen_extv_internal (operands[0], operands[1], - operands[2], operands[3])); + emit_insn (gen_extvsi_internal (operands[0], operands[1], + operands[2], operands[3])); DONE; }) -(define_insn "extv_internal" +(define_insn "extvsi_internal" [(set (match_operand:SI 0 "register_operand" "=a") (sign_extract:SI (match_operand:SI 1 "register_operand" "r") (match_operand:SI 2 "sext_fldsz_operand" "i") @@ -666,7 +857,7 @@ (set_attr "mode" "SI") (set_attr "length" "3")]) -(define_expand "extzv" +(define_expand "extzvsi" [(set (match_operand:SI 0 "register_operand" "") (zero_extract:SI (match_operand:SI 1 "register_operand" "") (match_operand:SI 2 "const_int_operand" "") @@ -675,12 +866,12 @@ { if (!extui_fldsz_operand (operands[2], SImode)) FAIL; - emit_insn (gen_extzv_internal (operands[0], operands[1], - operands[2], operands[3])); + emit_insn (gen_extzvsi_internal (operands[0], operands[1], + operands[2], operands[3])); DONE; }) -(define_insn "extzv_internal" +(define_insn "extzvsi_internal" [(set (match_operand:SI 0 "register_operand" "=a") (zero_extract:SI (match_operand:SI 1 "register_operand" "r") (match_operand:SI 2 "extui_fldsz_operand" "i") @@ -754,11 +945,14 @@ because of offering further optimization opportunities. */ if (register_operand (operands[0], DImode)) { - rtx first, second; - - split_double (operands[1], &first, &second); - emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), first)); - emit_insn (gen_movsi (gen_highpart (SImode, operands[0]), second)); + rtx lowpart, highpart; + + if (TARGET_BIG_ENDIAN) + split_double (operands[1], &highpart, &lowpart); + else + split_double (operands[1], &lowpart, &highpart); + emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), lowpart)); + emit_insn (gen_movsi (gen_highpart (SImode, operands[0]), highpart)); DONE; } @@ -828,6 +1022,19 @@ (set_attr "mode" "SI") (set_attr "length" "2,2,2,2,2,2,3,3,3,3,6,3,3,3,3,3")]) +(define_split + [(set (match_operand:SI 0 "register_operand") + (match_operand:SI 1 "constantpool_operand"))] + "! optimize_debug && reload_completed" + [(const_int 0)] +{ + rtx x = avoid_constant_pool_reference (operands[1]); + if (! CONST_INT_P (x)) + FAIL; + if (! 
xtensa_constantsynth (operands[0], INTVAL (x))) + emit_move_insn (operands[0], x); +}) + ;; 16-bit Integer moves (define_expand "movhi" @@ -1030,6 +1237,43 @@ (set_attr "mode" "SF") (set_attr "length" "3")]) +(define_split + [(set (match_operand:SF 0 "register_operand") + (match_operand:SF 1 "constantpool_operand"))] + "! optimize_debug && reload_completed" + [(const_int 0)] +{ + int i = 0; + rtx x = XEXP (operands[1], 0); + long l[2]; + if (GET_CODE (x) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x)) + x = get_pool_constant (x); + else if (GET_CODE (x) == CONST) + { + x = XEXP (x, 0); + gcc_assert (GET_CODE (x) == PLUS + && GET_CODE (XEXP (x, 0)) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)) + && CONST_INT_P (XEXP (x, 1))); + i = INTVAL (XEXP (x, 1)); + gcc_assert (i == 0 || i == 4); + i /= 4; + x = get_pool_constant (XEXP (x, 0)); + } + else + gcc_unreachable (); + if (GET_MODE (x) == SFmode) + REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l[0]); + else if (GET_MODE (x) == DFmode) + REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l); + else + FAIL; + x = gen_rtx_REG (SImode, REGNO (operands[0])); + if (! xtensa_constantsynth (x, l[i])) + emit_move_insn (x, GEN_INT (l[i])); +}) + ;; 64-bit floating point moves (define_expand "movdf" @@ -1080,6 +1324,22 @@ DONE; }) +;; Block sets + +(define_expand "setmemsi" + [(match_operand:BLK 0 "memory_operand") + (match_operand:SI 1 "") + (match_operand:SI 2 "") + (match_operand:SI 3 "const_int_operand")] + "!optimize_debug && optimize" +{ + if (xtensa_expand_block_set_unrolled_loop (operands)) + DONE; + if (xtensa_expand_block_set_small_loop (operands)) + DONE; + FAIL; +}) + ;; Shift instructions. @@ -1092,16 +1352,6 @@ operands[1] = xtensa_copy_incoming_a7 (operands[1]); }) -(define_insn "*ashlsi3_1" - [(set (match_operand:SI 0 "register_operand" "=a") - (ashift:SI (match_operand:SI 1 "register_operand" "r") - (const_int 1)))] - "TARGET_DENSITY" - "add.n\t%0, %1, %1" - [(set_attr "type" "arith") - (set_attr "mode" "SI") - (set_attr "length" "2")]) - (define_insn "ashlsi3_internal" [(set (match_operand:SI 0 "register_operand" "=a,a") (ashift:SI (match_operand:SI 1 "register_operand" "r,r") @@ -1114,16 +1364,14 @@ (set_attr "mode" "SI") (set_attr "length" "3,6")]) -(define_insn "*ashlsi3_3x" - [(set (match_operand:SI 0 "register_operand" "=a") - (ashift:SI (match_operand:SI 1 "register_operand" "r") - (ashift:SI (match_operand:SI 2 "register_operand" "r") - (const_int 3))))] - "" - "ssa8b\t%2\;sll\t%0, %1" - [(set_attr "type" "arith") - (set_attr "mode" "SI") - (set_attr "length" "6")]) +(define_split + [(set (match_operand:SI 0 "register_operand") + (ashift:SI (match_operand:SI 1 "register_operand") + (const_int 1)))] + "TARGET_DENSITY" + [(set (match_dup 0) + (plus:SI (match_dup 1) + (match_dup 1)))]) (define_insn "ashrsi3" [(set (match_operand:SI 0 "register_operand" "=a,a") @@ -1137,17 +1385,6 @@ (set_attr "mode" "SI") (set_attr "length" "3,6")]) -(define_insn "*ashrsi3_3x" - [(set (match_operand:SI 0 "register_operand" "=a") - (ashiftrt:SI (match_operand:SI 1 "register_operand" "r") - (ashift:SI (match_operand:SI 2 "register_operand" "r") - (const_int 3))))] - "" - "ssa8l\t%2\;sra\t%0, %1" - [(set_attr "type" "arith") - (set_attr "mode" "SI") - (set_attr "length" "6")]) - (define_insn "lshrsi3" [(set (match_operand:SI 0 "register_operand" "=a,a") (lshiftrt:SI (match_operand:SI 1 "register_operand" "r,r") @@ -1157,9 +1394,9 @@ if (which_alternative == 0) { if ((INTVAL (operands[2]) & 0x1f) < 16) - return 
"srli\t%0, %1, %R2"; + return "srli\t%0, %1, %R2"; else - return "extui\t%0, %1, %R2, %L2"; + return "extui\t%0, %1, %R2, %L2"; } return "ssr\t%2\;srl\t%0, %1"; } @@ -1167,13 +1404,170 @@ (set_attr "mode" "SI") (set_attr "length" "3,6")]) -(define_insn "*lshrsi3_3x" +(define_insn "*shift_per_byte" + [(set (match_operand:SI 0 "register_operand" "=a") + (match_operator:SI 3 "xtensa_shift_per_byte_operator" + [(match_operand:SI 1 "register_operand" "r") + (ashift:SI (match_operand:SI 2 "register_operand" "r") + (const_int 3))]))] + "!optimize_debug && optimize" +{ + switch (GET_CODE (operands[3])) + { + case ASHIFT: return "ssa8b\t%2\;sll\t%0, %1"; + case ASHIFTRT: return "ssa8l\t%2\;sra\t%0, %1"; + case LSHIFTRT: return "ssa8l\t%2\;srl\t%0, %1"; + default: gcc_unreachable (); + } +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "6")]) + +(define_insn_and_split "*shift_per_byte_omit_AND_0" [(set (match_operand:SI 0 "register_operand" "=a") - (lshiftrt:SI (match_operand:SI 1 "register_operand" "r") - (ashift:SI (match_operand:SI 2 "register_operand" "r") - (const_int 3))))] + (match_operator:SI 4 "xtensa_shift_per_byte_operator" + [(match_operand:SI 1 "register_operand" "r") + (and:SI (ashift:SI (match_operand:SI 2 "register_operand" "r") + (const_int 3)) + (match_operand:SI 3 "const_int_operand" "i"))]))] + "!optimize_debug && optimize + && (INTVAL (operands[3]) & 0x1f) == 3 << 3" + "#" + "&& 1" + [(set (match_dup 0) + (match_op_dup 4 + [(match_dup 1) + (ashift:SI (match_dup 2) + (const_int 3))]))] + "" + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "6")]) + +(define_insn_and_split "*shift_per_byte_omit_AND_1" + [(set (match_operand:SI 0 "register_operand" "=a") + (match_operator:SI 4 "xtensa_shift_per_byte_operator" + [(match_operand:SI 1 "register_operand" "r") + (neg:SI (and:SI (ashift:SI (match_operand:SI 2 "register_operand" "r") + (const_int 3)) + (match_operand:SI 3 "const_int_operand" "i")))]))] + "!optimize_debug && optimize + && (INTVAL (operands[3]) & 0x1f) == 3 << 3" + "#" + "&& can_create_pseudo_p ()" + [(set (match_dup 5) + (neg:SI (match_dup 2))) + (set (match_dup 0) + (match_op_dup 4 + [(match_dup 1) + (ashift:SI (match_dup 5) + (const_int 3))]))] +{ + operands[5] = gen_reg_rtx (SImode); +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "9")]) + +(define_insn "*shlrd_reg_<code>" + [(set (match_operand:SI 0 "register_operand" "=a") + (ior_op:SI (match_operator:SI 4 "logical_shift_operator" + [(match_operand:SI 1 "register_operand" "r") + (match_operand:SI 2 "register_operand" "r")]) + (match_operator:SI 5 "logical_shift_operator" + [(match_operand:SI 3 "register_operand" "r") + (neg:SI (match_dup 2))])))] + "!optimize_debug && optimize + && xtensa_shlrd_which_direction (operands[4], operands[5]) != UNKNOWN" +{ + switch (xtensa_shlrd_which_direction (operands[4], operands[5])) + { + case ASHIFT: return "ssl\t%2\;src\t%0, %1, %3"; + case LSHIFTRT: return "ssr\t%2\;src\t%0, %3, %1"; + default: gcc_unreachable (); + } +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "6")]) + +(define_insn "*shlrd_const_<code>" + [(set (match_operand:SI 0 "register_operand" "=a") + (ior_op:SI (match_operator:SI 5 "logical_shift_operator" + [(match_operand:SI 1 "register_operand" "r") + (match_operand:SI 3 "const_int_operand" "i")]) + (match_operator:SI 6 "logical_shift_operator" + [(match_operand:SI 2 "register_operand" "r") + (match_operand:SI 4 "const_int_operand" "i")])))] + 
"!optimize_debug && optimize + && xtensa_shlrd_which_direction (operands[5], operands[6]) != UNKNOWN + && IN_RANGE (INTVAL (operands[3]), 1, 31) + && IN_RANGE (INTVAL (operands[4]), 1, 31) + && INTVAL (operands[3]) + INTVAL (operands[4]) == 32" +{ + switch (xtensa_shlrd_which_direction (operands[5], operands[6])) + { + case ASHIFT: return "ssai\t%L3\;src\t%0, %1, %2"; + case LSHIFTRT: return "ssai\t%R3\;src\t%0, %2, %1"; + default: gcc_unreachable (); + } +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "6")]) + +(define_insn "*shlrd_per_byte_<code>" + [(set (match_operand:SI 0 "register_operand" "=a") + (ior_op:SI (match_operator:SI 4 "logical_shift_operator" + [(match_operand:SI 1 "register_operand" "r") + (ashift:SI (match_operand:SI 2 "register_operand" "r") + (const_int 3))]) + (match_operator:SI 5 "logical_shift_operator" + [(match_operand:SI 3 "register_operand" "r") + (neg:SI (ashift:SI (match_dup 2) + (const_int 3)))])))] + "!optimize_debug && optimize + && xtensa_shlrd_which_direction (operands[4], operands[5]) != UNKNOWN" +{ + switch (xtensa_shlrd_which_direction (operands[4], operands[5])) + { + case ASHIFT: return "ssa8b\t%2\;src\t%0, %1, %3"; + case LSHIFTRT: return "ssa8l\t%2\;src\t%0, %3, %1"; + default: gcc_unreachable (); + } +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set_attr "length" "6")]) + +(define_insn_and_split "*shlrd_per_byte_<code>_omit_AND" + [(set (match_operand:SI 0 "register_operand" "=a") + (ior_op:SI (match_operator:SI 5 "logical_shift_operator" + [(match_operand:SI 1 "register_operand" "r") + (and:SI (ashift:SI (match_operand:SI 2 "register_operand" "r") + (const_int 3)) + (match_operand:SI 4 "const_int_operand" "i"))]) + (match_operator:SI 6 "logical_shift_operator" + [(match_operand:SI 3 "register_operand" "r") + (neg:SI (and:SI (ashift:SI (match_dup 2) + (const_int 3)) + (match_dup 4)))])))] + "!optimize_debug && optimize + && xtensa_shlrd_which_direction (operands[5], operands[6]) != UNKNOWN + && (INTVAL (operands[4]) & 0x1f) == 3 << 3" + "#" + "&& 1" + [(set (match_dup 0) + (ior_op:SI (match_op_dup 5 + [(match_dup 1) + (ashift:SI (match_dup 2) + (const_int 3))]) + (match_op_dup 6 + [(match_dup 3) + (neg:SI (ashift:SI (match_dup 2) + (const_int 3)))])))] "" - "ssa8l\t%2\;srl\t%0, %1" [(set_attr "type" "arith") (set_attr "mode" "SI") (set_attr "length" "6")]) @@ -1234,28 +1628,13 @@ (define_insn "*btrue" [(set (pc) (if_then_else (match_operator 3 "branch_operator" - [(match_operand:SI 0 "register_operand" "r,r") - (match_operand:SI 1 "branch_operand" "K,r")]) + [(match_operand:SI 0 "register_operand" "r,r") + (match_operand:SI 1 "branch_operand" "K,r")]) (label_ref (match_operand 2 "" "")) (pc)))] "" { - return xtensa_emit_branch (false, which_alternative == 0, operands); -} - [(set_attr "type" "jump,jump") - (set_attr "mode" "none") - (set_attr "length" "3,3")]) - -(define_insn "*bfalse" - [(set (pc) - (if_then_else (match_operator 3 "branch_operator" - [(match_operand:SI 0 "register_operand" "r,r") - (match_operand:SI 1 "branch_operand" "K,r")]) - (pc) - (label_ref (match_operand 2 "" ""))))] - "" -{ - return xtensa_emit_branch (true, which_alternative == 0, operands); + return xtensa_emit_branch (which_alternative == 0, operands); } [(set_attr "type" "jump,jump") (set_attr "mode" "none") @@ -1264,28 +1643,13 @@ (define_insn "*ubtrue" [(set (pc) (if_then_else (match_operator 3 "ubranch_operator" - [(match_operand:SI 0 "register_operand" "r,r") - (match_operand:SI 1 "ubranch_operand" "L,r")]) + 
[(match_operand:SI 0 "register_operand" "r,r") + (match_operand:SI 1 "ubranch_operand" "L,r")]) (label_ref (match_operand 2 "" "")) (pc)))] "" { - return xtensa_emit_branch (false, which_alternative == 0, operands); -} - [(set_attr "type" "jump,jump") - (set_attr "mode" "none") - (set_attr "length" "3,3")]) - -(define_insn "*ubfalse" - [(set (pc) - (if_then_else (match_operator 3 "ubranch_operator" - [(match_operand:SI 0 "register_operand" "r,r") - (match_operand:SI 1 "ubranch_operand" "L,r")]) - (pc) - (label_ref (match_operand 2 "" ""))))] - "" -{ - return xtensa_emit_branch (true, which_alternative == 0, operands); + return xtensa_emit_branch (which_alternative == 0, operands); } [(set_attr "type" "jump,jump") (set_attr "mode" "none") @@ -1296,80 +1660,178 @@ (define_insn "*bittrue" [(set (pc) (if_then_else (match_operator 3 "boolean_operator" - [(zero_extract:SI - (match_operand:SI 0 "register_operand" "r,r") - (const_int 1) - (match_operand:SI 1 "arith_operand" "J,r")) + [(zero_extract:SI (match_operand:SI 0 "register_operand" "r,r") + (const_int 1) + (match_operand:SI 1 "arith_operand" "J,r")) (const_int 0)]) (label_ref (match_operand 2 "" "")) (pc)))] "" { - return xtensa_emit_bit_branch (false, which_alternative == 0, operands); + static char result[64]; + char op; + switch (GET_CODE (operands[3])) + { + case EQ: op = 'c'; break; + case NE: op = 's'; break; + default: gcc_unreachable (); + } + if (which_alternative == 0) + { + operands[1] = GEN_INT (INTVAL (operands[1]) & 0x1f); + sprintf (result, "bb%ci\t%%0, %%d1, %%2", op); + } + else + sprintf (result, "bb%c\t%%0, %%1, %%2", op); + return result; } [(set_attr "type" "jump") (set_attr "mode" "none") (set_attr "length" "3")]) -(define_insn "*bitfalse" +(define_insn "*masktrue" [(set (pc) (if_then_else (match_operator 3 "boolean_operator" - [(zero_extract:SI - (match_operand:SI 0 "register_operand" "r,r") - (const_int 1) - (match_operand:SI 1 "arith_operand" "J,r")) + [(and:SI (match_operand:SI 0 "register_operand" "r") + (match_operand:SI 1 "register_operand" "r")) (const_int 0)]) - (pc) - (label_ref (match_operand 2 "" ""))))] + (label_ref (match_operand 2 "" "")) + (pc)))] "" { - return xtensa_emit_bit_branch (true, which_alternative == 0, operands); + switch (GET_CODE (operands[3])) + { + case EQ: return "bnone\t%0, %1, %2"; + case NE: return "bany\t%0, %1, %2"; + default: gcc_unreachable (); + } } [(set_attr "type" "jump") (set_attr "mode" "none") (set_attr "length" "3")]) -(define_insn "*masktrue" +(define_insn "*masktrue_bitcmpl" [(set (pc) (if_then_else (match_operator 3 "boolean_operator" - [(and:SI (match_operand:SI 0 "register_operand" "r") - (match_operand:SI 1 "register_operand" "r")) - (const_int 0)]) + [(and:SI (not:SI (match_operand:SI 0 "register_operand" "r")) + (match_operand:SI 1 "register_operand" "r")) + (const_int 0)]) (label_ref (match_operand 2 "" "")) (pc)))] "" { switch (GET_CODE (operands[3])) { - case EQ: return "bnone\t%0, %1, %2"; - case NE: return "bany\t%0, %1, %2"; - default: gcc_unreachable (); + case EQ: return "ball\t%0, %1, %2"; + case NE: return "bnall\t%0, %1, %2"; + default: gcc_unreachable (); } } [(set_attr "type" "jump") (set_attr "mode" "none") (set_attr "length" "3")]) -(define_insn "*maskfalse" +(define_insn_and_split "*masktrue_const_pow2_minus_one" [(set (pc) (if_then_else (match_operator 3 "boolean_operator" - [(and:SI (match_operand:SI 0 "register_operand" "r") - (match_operand:SI 1 "register_operand" "r")) - (const_int 0)]) - (pc) - (label_ref (match_operand 2 "" ""))))] - "" + 
[(and:SI (match_operand:SI 0 "register_operand" "r") + (match_operand:SI 1 "const_int_operand" "i")) + (const_int 0)]) + (label_ref (match_operand 2 "" "")) + (pc)))] + "IN_RANGE (exact_log2 (INTVAL (operands[1]) + 1), 17, 31)" + "#" + "&& can_create_pseudo_p ()" + [(set (match_dup 4) + (ashift:SI (match_dup 0) + (match_dup 1))) + (set (pc) + (if_then_else (match_op_dup 3 + [(match_dup 4) + (const_int 0)]) + (label_ref (match_dup 2)) + (pc)))] { - switch (GET_CODE (operands[3])) - { - case EQ: return "bany\t%0, %1, %2"; - case NE: return "bnone\t%0, %1, %2"; - default: gcc_unreachable (); - } + operands[1] = GEN_INT (32 - floor_log2 (INTVAL (operands[1]) + 1)); + operands[4] = gen_reg_rtx (SImode); } [(set_attr "type" "jump") (set_attr "mode" "none") - (set_attr "length" "3")]) + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY + && INTVAL (operands[1]) == 0x7FFFFFFF") + (const_int 5) + (const_int 6)))]) + +(define_insn_and_split "*masktrue_const_negative_pow2" + [(set (pc) + (if_then_else (match_operator 3 "boolean_operator" + [(and:SI (match_operand:SI 0 "register_operand" "r") + (match_operand:SI 1 "const_int_operand" "i")) + (const_int 0)]) + (label_ref (match_operand 2 "" "")) + (pc)))] + "IN_RANGE (exact_log2 (-INTVAL (operands[1])), 12, 30)" + "#" + "&& can_create_pseudo_p ()" + [(set (match_dup 4) + (lshiftrt:SI (match_dup 0) + (match_dup 1))) + (set (pc) + (if_then_else (match_op_dup 3 + [(match_dup 4) + (const_int 0)]) + (label_ref (match_dup 2)) + (pc)))] +{ + operands[1] = GEN_INT (floor_log2 (-INTVAL (operands[1]))); + operands[4] = gen_reg_rtx (SImode); +} + [(set_attr "type" "jump") + (set_attr "mode" "none") + (set_attr "length" "6")]) + +(define_insn_and_split "*masktrue_const_shifted_mask" + [(set (pc) + (if_then_else (match_operator 4 "boolean_operator" + [(and:SI (match_operand:SI 0 "register_operand" "r") + (match_operand:SI 1 "shifted_mask_operand" "i")) + (match_operand:SI 2 "const_int_operand" "i")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "(INTVAL (operands[2]) & ((1 << ctz_hwi (INTVAL (operands[1]))) - 1)) == 0 + && xtensa_b4const_or_zero ((uint32_t)INTVAL (operands[2]) >> ctz_hwi (INTVAL (operands[1])))" + "#" + "&& can_create_pseudo_p ()" + [(set (match_dup 6) + (zero_extract:SI (match_dup 0) + (match_dup 5) + (match_dup 1))) + (set (pc) + (if_then_else (match_op_dup 4 + [(match_dup 6) + (match_dup 2)]) + (label_ref (match_dup 3)) + (pc)))] +{ + HOST_WIDE_INT mask = INTVAL (operands[1]); + int shift = ctz_hwi (mask); + int mask_size = floor_log2 (((uint32_t)mask >> shift) + 1); + int mask_pos = shift; + if (BITS_BIG_ENDIAN) + mask_pos = (32 - (mask_size + shift)) & 0x1f; + operands[1] = GEN_INT (mask_pos); + operands[2] = GEN_INT ((uint32_t)INTVAL (operands[2]) >> shift); + operands[5] = GEN_INT (mask_size); + operands[6] = gen_reg_rtx (SImode); +} + [(set_attr "type" "jump") + (set_attr "mode" "none") + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY + && (uint32_t)INTVAL (operands[2]) >> ctz_hwi (INTVAL (operands[1])) == 0") + (const_int 5) + (const_int 6)))]) ;; Zero-overhead looping support. 
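Editorial note on the AND/mask and funnel-shift hunks above: the new *andsi3_const_* and *masktrue_const_* splits trade an AND immediate that does not fit the instruction encoding for a pair of shifts or an extui field extract, and the *shlrd_* patterns map a double-register shift onto one src. The underlying identities, as a hedged C sketch (ranges copied from the IN_RANGE conditions in the hunks; helper names are mine):

  #include <stdint.h>

  /* x & (2**n - 1), 17 <= n <= 31: clear the high bits, slli then srli.  */
  uint32_t and_low_mask (uint32_t x, unsigned n)
  {
    return (x << (32 - n)) >> (32 - n);
  }

  /* x & -(2**n), 12 <= n <= 31: clear the low bits, srli then slli.  */
  uint32_t and_neg_pow2 (uint32_t x, unsigned n)
  {
    return (x >> n) << n;
  }

  /* (x & shifted_mask) == cst, with cst's low 'shift' bits all zero:
     compare the extracted field instead (extui + a small-constant compare).  */
  int masked_eq (uint32_t x, unsigned shift, unsigned width, uint32_t cst)
  {
    uint32_t field = (x >> shift) & ((1u << width) - 1);
    return field == cst >> shift;
  }

  /* *shlrd_reg / *shlrd_const: (a << n) | (b >> (32 - n)) for 1 <= n <= 31,
     emitted as "ssl/ssai + src", one funnel shift through the a:b pair.  */
  uint32_t shlrd (uint32_t a, uint32_t b, unsigned n)
  {
    return (a << n) | (b >> (32 - n));
  }

The *masktrue_* branch variants apply the same rewrites but keep the comparison inside the conditional-branch pattern, so the AND never materializes in a register.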
@@ -1691,18 +2153,13 @@ (match_operand 1 "" ""))] "" { - rtx addr = XEXP (operands[0], 0); - if (flag_pic && GET_CODE (addr) == SYMBOL_REF - && (!SYMBOL_REF_LOCAL_P (addr) || SYMBOL_REF_EXTERNAL_P (addr))) - addr = gen_sym_PLT (addr); - if (!call_insn_operand (addr, VOIDmode)) - XEXP (operands[0], 0) = copy_to_mode_reg (Pmode, addr); + xtensa_prepare_expand_call (0, operands); }) (define_insn "call_internal" [(call (mem (match_operand:SI 0 "call_insn_operand" "nir")) (match_operand 1 "" "i"))] - "" + "!SIBLING_CALL_P (insn)" { return xtensa_emit_call (0, operands); } @@ -1716,19 +2173,14 @@ (match_operand 2 "" "")))] "" { - rtx addr = XEXP (operands[1], 0); - if (flag_pic && GET_CODE (addr) == SYMBOL_REF - && (!SYMBOL_REF_LOCAL_P (addr) || SYMBOL_REF_EXTERNAL_P (addr))) - addr = gen_sym_PLT (addr); - if (!call_insn_operand (addr, VOIDmode)) - XEXP (operands[1], 0) = copy_to_mode_reg (Pmode, addr); + xtensa_prepare_expand_call (1, operands); }) (define_insn "call_value_internal" [(set (match_operand 0 "register_operand" "=a") (call (mem (match_operand:SI 1 "call_insn_operand" "nir")) (match_operand 2 "" "i")))] - "" + "!SIBLING_CALL_P (insn)" { return xtensa_emit_call (1, operands); } @@ -1736,6 +2188,70 @@ (set_attr "mode" "none") (set_attr "length" "3")]) +(define_expand "sibcall" + [(call (match_operand 0 "memory_operand" "") + (match_operand 1 "" ""))] + "!TARGET_WINDOWED_ABI" +{ + xtensa_prepare_expand_call (0, operands); +}) + +(define_insn "sibcall_internal" + [(call (mem:SI (match_operand:SI 0 "call_insn_operand" "nir")) + (match_operand 1 "" "i"))] + "!TARGET_WINDOWED_ABI && SIBLING_CALL_P (insn)" +{ + return xtensa_emit_sibcall (0, operands); +} + [(set_attr "type" "call") + (set_attr "mode" "none") + (set_attr "length" "3")]) + +(define_split + [(call (mem:SI (match_operand:SI 0 "register_operand")) + (match_operand 1 ""))] + "reload_completed + && !TARGET_WINDOWED_ABI && SIBLING_CALL_P (insn) + && IN_RANGE (REGNO (operands[0]), 12, 15)" + [(set (reg:SI A10_REG) + (match_dup 0)) + (call (mem:SI (reg:SI A10_REG)) + (match_dup 1))]) + +(define_expand "sibcall_value" + [(set (match_operand 0 "register_operand" "") + (call (match_operand 1 "memory_operand" "") + (match_operand 2 "" "")))] + "!TARGET_WINDOWED_ABI" +{ + xtensa_prepare_expand_call (1, operands); +}) + +(define_insn "sibcall_value_internal" + [(set (match_operand 0 "register_operand" "=a") + (call (mem:SI (match_operand:SI 1 "call_insn_operand" "nir")) + (match_operand 2 "" "i")))] + "!TARGET_WINDOWED_ABI && SIBLING_CALL_P (insn)" +{ + return xtensa_emit_sibcall (1, operands); +} + [(set_attr "type" "call") + (set_attr "mode" "none") + (set_attr "length" "3")]) + +(define_split + [(set (match_operand 0 "register_operand") + (call (mem:SI (match_operand:SI 1 "register_operand")) + (match_operand 2 "")))] + "reload_completed + && !TARGET_WINDOWED_ABI && SIBLING_CALL_P (insn) + && IN_RANGE (REGNO (operands[1]), 12, 15)" + [(set (reg:SI A10_REG) + (match_dup 1)) + (set (match_dup 0) + (call (mem:SI (reg:SI A10_REG)) + (match_dup 2)))]) + (define_insn "entry" [(set (reg:SI A1_REG) (unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "i")] @@ -1757,7 +2273,10 @@ } [(set_attr "type" "jump") (set_attr "mode" "none") - (set_attr "length" "2")]) + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY") + (const_int 2) + (const_int 3)))]) ;; Miscellaneous instructions. 
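Editorial note on the call hunks above: the PIC/PLT address fix-up moves into xtensa_prepare_expand_call, and sibling-call (tail-call) patterns are added for the non-windowed ABI. The post-reload splits copy a callee address left in a12-a15 into a10, since those higher registers may be restored by the epilogue that runs before the jump. For illustration only, the kind of C call that the new sibcall_internal pattern can cover when the windowed ABI is disabled:

  extern int g (int);

  int f (int x)
  {
    return g (x + 1);   /* call in tail position: jump to g, no new frame */
  }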
@@ -1800,7 +2319,15 @@ [(return)] "" { - xtensa_expand_epilogue (); + xtensa_expand_epilogue (false); + DONE; +}) + +(define_expand "sibcall_epilogue" + [(return)] + "!TARGET_WINDOWED_ABI" +{ + xtensa_expand_epilogue (true); DONE; }) @@ -1812,7 +2339,10 @@ } [(set_attr "type" "nop") (set_attr "mode" "none") - (set_attr "length" "3")]) + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY") + (const_int 2) + (const_int 3)))]) (define_expand "nonlocal_goto" [(match_operand:SI 0 "general_operand" "") @@ -1876,8 +2406,9 @@ [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)] "" "" - [(set_attr "length" "0") - (set_attr "type" "nop")]) + [(set_attr "type" "nop") + (set_attr "mode" "none") + (set_attr "length" "0")]) ;; Do not schedule instructions accessing memory before this point. @@ -1896,7 +2427,9 @@ (unspec:BLK [(match_operand:SI 1 "" "")] UNSPEC_FRAME_BLOCKAGE))] "" "" - [(set_attr "length" "0")]) + [(set_attr "type" "nop") + (set_attr "mode" "none") + (set_attr "length" "0")]) (define_insn "trap" [(trap_if (const_int 1) (const_int 0))] @@ -1909,7 +2442,10 @@ } [(set_attr "type" "trap") (set_attr "mode" "none") - (set_attr "length" "3")]) + (set (attr "length") + (if_then_else (match_test "!TARGET_DEBUG && TARGET_DENSITY") + (const_int 2) + (const_int 3)))]) ;; Setting up a frame pointer is tricky for Xtensa because GCC doesn't ;; know if a frame pointer is required until the reload pass, and @@ -2172,3 +2708,103 @@ xtensa_expand_atomic (<CODE>, operands[0], operands[1], operands[2], true); DONE; }) + +(define_insn_and_split "*round_up_to_even" + [(set (match_operand:SI 0 "register_operand" "=a") + (and:SI (plus:SI (match_operand:SI 1 "register_operand" "r") + (const_int 1)) + (const_int -2)))] + "" + "#" + "can_create_pseudo_p ()" + [(set (match_dup 2) + (and:SI (match_dup 1) + (const_int 1))) + (set (match_dup 0) + (plus:SI (match_dup 2) + (match_dup 1)))] +{ + operands[2] = gen_reg_rtx (SImode); +} + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY") + (const_int 5) + (const_int 6)))]) + +(define_insn_and_split "*signed_ge_zero" + [(set (match_operand:SI 0 "register_operand" "=a") + (ge:SI (match_operand:SI 1 "register_operand" "r") + (const_int 0)))] + "" + "#" + "" + [(set (match_dup 0) + (ashiftrt:SI (match_dup 1) + (const_int 31))) + (set (match_dup 0) + (plus:SI (match_dup 0) + (const_int 1)))] + "" + [(set_attr "type" "arith") + (set_attr "mode" "SI") + (set (attr "length") + (if_then_else (match_test "TARGET_DENSITY") + (const_int 5) + (const_int 6)))]) + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (match_operand:SI 6 "reload_operand")) + (set (match_operand:SI 1 "register_operand") + (match_operand:SI 7 "reload_operand")) + (set (match_operand:SF 2 "register_operand") + (match_operand:SF 4 "register_operand")) + (set (match_operand:SF 3 "register_operand") + (match_operand:SF 5 "register_operand"))] + "REGNO (operands[0]) == REGNO (operands[4]) + && REGNO (operands[1]) == REGNO (operands[5]) + && peep2_reg_dead_p (4, operands[0]) + && peep2_reg_dead_p (4, operands[1])" + [(set (match_dup 2) + (match_dup 6)) + (set (match_dup 3) + (match_dup 7))] +{ + uint32_t check = 0; + int i; + for (i = 0; i <= 3; ++i) + { + uint32_t mask = (uint32_t)1 << REGNO (operands[i]); + if (check & mask) + FAIL; + check |= mask; + } + operands[6] = gen_rtx_MEM (SFmode, XEXP (operands[6], 0)); + operands[7] = gen_rtx_MEM (SFmode, XEXP (operands[7], 0)); +}) + +(define_split + [(clobber 
(match_operand:DSC 0 "register_operand"))] + "GP_REG_P (REGNO (operands[0]))" + [(const_int 0)] +{ + unsigned int regno = REGNO (operands[0]); + machine_mode inner_mode = GET_MODE_INNER (<MODE>mode); + rtx_insn *insn; + rtx x; + if (! ((insn = next_nonnote_nondebug_insn (curr_insn)) + && NONJUMP_INSN_P (insn) + && GET_CODE (x = PATTERN (insn)) == SET + && REG_P (x = XEXP (x, 0)) + && GET_MODE (x) == inner_mode + && REGNO (x) == regno + && (insn = next_nonnote_nondebug_insn (insn)) + && NONJUMP_INSN_P (insn) + && GET_CODE (x = PATTERN (insn)) == SET + && REG_P (x = XEXP (x, 0)) + && GET_MODE (x) == inner_mode + && REGNO (x) == regno + REG_NREGS (operands[0]) / 2)) + FAIL; +}) diff --git a/gcc/config/xtensa/xtensa.opt b/gcc/config/xtensa/xtensa.opt index c406297..08338e3 100644 --- a/gcc/config/xtensa/xtensa.opt +++ b/gcc/config/xtensa/xtensa.opt @@ -27,9 +27,13 @@ Target Mask(FORCE_NO_PIC) Disable position-independent code (PIC) for use in OS kernel code. mlongcalls -Target +Target Mask(LONGCALLS) Use indirect CALLXn instructions for large programs. +mextra-l32r-costs= +Target RejectNegative Joined UInteger Var(xtensa_extra_l32r_costs) Init(0) +Set extra memory access cost for L32R instruction, in clock-cycle units. + mtarget-align Target Automatically align branch targets to reduce branch penalties. |