Diffstat (limited to 'gcc/config')
89 files changed, 4157 insertions, 1662 deletions
diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc deleted file mode 100644 index cea54de..0000000 --- a/gcc/config/aarch64/aarch64-cc-fusion.cc +++ /dev/null @@ -1,297 +0,0 @@ -// Pass to fuse CC operations with other instructions. -// Copyright (C) 2021-2025 Free Software Foundation, Inc. -// -// This file is part of GCC. -// -// GCC is free software; you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 3, or (at your option) any later -// version. -// -// GCC is distributed in the hope that it will be useful, but WITHOUT ANY -// WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License -// along with GCC; see the file COPYING3. If not see -// <http://www.gnu.org/licenses/>. - -// This pass looks for sequences of the form: -// -// A: (set (reg R1) X1) -// B: ...instructions that might change the value of X1... -// C: (set (reg CC) X2) // X2 uses R1 -// -// and tries to change them to: -// -// C': [(set (reg CC) X2') -// (set (reg R1) X1)] -// B: ...instructions that might change the value of X1... -// -// where X2' is the result of replacing R1 with X1 in X2. -// -// This sequence occurs in SVE code in two important cases: -// -// (a) Sometimes, to deal correctly with overflow, we need to increment -// an IV after a WHILELO rather than before it. In this case: -// - A is a WHILELO, -// - B includes an IV increment and -// - C is a separate PTEST. -// -// (b) ACLE code of the form: -// -// svbool_t ok = svrdffr (); -// if (svptest_last (pg, ok)) -// ... -// -// must, for performance reasons, be code-generated as: -// -// RDFFRS Pok.B, Pg/Z -// ...branch on flags result... -// -// without a separate PTEST of Pok. In this case: -// - A is an aarch64_rdffr -// - B includes an aarch64_update_ffrt -// - C is a separate PTEST -// -// Combine can handle this optimization if B doesn't exist and if A and -// C are in the same BB. This pass instead handles cases where B does -// exist and cases where A and C are in different BBs of the same EBB. - -#define IN_TARGET_CODE 1 - -#define INCLUDE_ALGORITHM -#define INCLUDE_FUNCTIONAL -#define INCLUDE_ARRAY -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "df.h" -#include "rtl-ssa.h" -#include "tree-pass.h" - -using namespace rtl_ssa; - -namespace { -const pass_data pass_data_cc_fusion = -{ - RTL_PASS, // type - "cc_fusion", // name - OPTGROUP_NONE, // optinfo_flags - TV_NONE, // tv_id - 0, // properties_required - 0, // properties_provided - 0, // properties_destroyed - 0, // todo_flags_start - TODO_df_finish, // todo_flags_finish -}; - -// Class that represents one run of the pass. -class cc_fusion -{ -public: - cc_fusion () : m_parallel () {} - void execute (); - -private: - rtx optimizable_set (const insn_info *); - bool parallelize_insns (def_info *, rtx, def_info *, rtx); - void optimize_cc_setter (def_info *, rtx); - - // A spare PARALLEL rtx, or null if none. - rtx m_parallel; -}; - -// See whether INSN is a single_set that we can optimize. Return the -// set if so, otherwise return null. 
-rtx -cc_fusion::optimizable_set (const insn_info *insn) -{ - if (!insn->can_be_optimized () - || insn->is_asm () - || insn->has_volatile_refs () - || insn->has_pre_post_modify ()) - return NULL_RTX; - - return single_set (insn->rtl ()); -} - -// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise -// a single_set that sets (only) OTHER_DEF. CC_SET is known to set the -// CC register and the instruction that contains CC_SET is known to use -// OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel. -bool -cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set, - def_info *other_def, rtx other_set) -{ - auto attempt = crtl->ssa->new_change_attempt (); - - insn_info *cc_insn = cc_def->insn (); - insn_info *other_insn = other_def->insn (); - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "trying to parallelize insn %d and insn %d\n", - other_insn->uid (), cc_insn->uid ()); - - // Try to substitute OTHER_SET into CC_INSN. - insn_change_watermark rtl_watermark; - rtx_insn *cc_rtl = cc_insn->rtl (); - insn_propagation prop (cc_rtl, SET_DEST (other_set), - SET_SRC (other_set)); - if (!prop.apply_to_pattern (&PATTERN (cc_rtl)) - || prop.num_replacements == 0) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- failed to substitute all uses of r%d\n", - other_def->regno ()); - return false; - } - - // Restrict the uses to those outside notes. - use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ()); - use_array other_set_uses = remove_note_accesses (attempt, - other_insn->uses ()); - - // Remove the use of the substituted value. - access_array_builder uses_builder (attempt); - uses_builder.reserve (cc_uses.size ()); - for (use_info *use : cc_uses) - if (use->def () != other_def) - uses_builder.quick_push (use); - cc_uses = use_array (uses_builder.finish ()); - - // Get the list of uses for the new instruction. - insn_change cc_change (cc_insn); - cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses); - if (!cc_change.new_uses.is_valid ()) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- cannot merge uses\n"); - return false; - } - - // The instruction initially defines just two registers. recog can add - // extra clobbers if necessary. - auto_vec<access_info *, 2> new_defs; - new_defs.quick_push (cc_def); - new_defs.quick_push (other_def); - sort_accesses (new_defs); - cc_change.new_defs = def_array (access_array (new_defs)); - - // Make sure there is somewhere that the new instruction could live. - auto other_change = insn_change::delete_insn (other_insn); - insn_change *changes[] = { &other_change, &cc_change }; - cc_change.move_range = cc_insn->ebb ()->insn_range (); - if (!restrict_movement (cc_change, ignore_changing_insns (changes))) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- cannot satisfy all definitions and uses\n"); - return false; - } - - // Tentatively install the new pattern. By convention, the CC set - // must be first. - if (m_parallel) - { - XVECEXP (m_parallel, 0, 0) = cc_set; - XVECEXP (m_parallel, 0, 1) = other_set; - } - else - { - rtvec vec = gen_rtvec (2, cc_set, other_set); - m_parallel = gen_rtx_PARALLEL (VOIDmode, vec); - } - validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1); - - // These routines report failures themselves. 
- if (!recog (attempt, cc_change, ignore_changing_insns (changes)) - || !changes_are_worthwhile (changes) - || !crtl->ssa->verify_insn_changes (changes)) - return false; - - remove_reg_equal_equiv_notes (cc_rtl); - confirm_change_group (); - crtl->ssa->change_insns (changes); - m_parallel = NULL_RTX; - return true; -} - -// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes -// a definition of the CC register by CC_SET. -void -cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set) -{ - // Search the registers used by the CC setter for an easily-substitutable - // def-use chain. - for (use_info *other_use : cc_def->insn ()->uses ()) - if (def_info *other_def = other_use->def ()) - if (other_use->regno () != CC_REGNUM - && other_def->ebb () == cc_def->ebb ()) - if (rtx other_set = optimizable_set (other_def->insn ())) - { - rtx dest = SET_DEST (other_set); - if (REG_P (dest) - && REGNO (dest) == other_def->regno () - && REG_NREGS (dest) == 1 - && parallelize_insns (cc_def, cc_set, other_def, other_set)) - return; - } -} - -// Run the pass on the current function. -void -cc_fusion::execute () -{ - // Initialization. - calculate_dominance_info (CDI_DOMINATORS); - df_analyze (); - crtl->ssa = new rtl_ssa::function_info (cfun); - - // Walk through all instructions that set CC. Look for a PTEST instruction - // that we can optimize. - // - // ??? The PTEST test isn't needed for correctness, but it ensures that the - // pass no effect on non-SVE code. - for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM)) - if (rtx cc_set = optimizable_set (def->insn ())) - if (REG_P (SET_DEST (cc_set)) - && REGNO (SET_DEST (cc_set)) == CC_REGNUM - && GET_CODE (SET_SRC (cc_set)) == UNSPEC - && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST) - optimize_cc_setter (def, cc_set); - - // Finalization. - crtl->ssa->perform_pending_updates (); - free_dominance_info (CDI_DOMINATORS); -} - -class pass_cc_fusion : public rtl_opt_pass -{ -public: - pass_cc_fusion (gcc::context *ctxt) - : rtl_opt_pass (pass_data_cc_fusion, ctxt) - {} - - // opt_pass methods: - virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; } - virtual unsigned int execute (function *); -}; - -unsigned int -pass_cc_fusion::execute (function *) -{ - cc_fusion ().execute (); - return 0; -} - -} // end namespace - -// Create a new CC fusion pass instance. 
- -rtl_opt_pass * -make_pass_cc_fusion (gcc::context *ctxt) -{ - return new pass_cc_fusion (ctxt); -} diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def index 9cf9d3e..6a53ff3 100644 --- a/gcc/config/aarch64/aarch64-passes.def +++ b/gcc/config/aarch64/aarch64-passes.def @@ -24,6 +24,5 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation); INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_switch_pstate_sm); INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_late_track_speculation); INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti); -INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion); INSERT_PASS_BEFORE (pass_early_remat, 1, pass_ldp_fusion); INSERT_PASS_BEFORE (pass_peephole2, 1, pass_ldp_fusion); diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 36bd885..56efcf2 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1098,6 +1098,7 @@ bool aarch64_legitimate_address_p (machine_mode, rtx, bool, aarch64_addr_query_type = ADDR_QUERY_M); machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx); rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); +rtx aarch64_gen_compare_split_imm24 (rtx, rtx, rtx); bool aarch64_maxmin_plus_const (rtx_code, rtx *, bool); rtx aarch64_load_tp (rtx); @@ -1236,7 +1237,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *); rtl_opt_pass *make_pass_track_speculation (gcc::context *); rtl_opt_pass *make_pass_late_track_speculation (gcc::context *); rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt); -rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt); rtl_opt_pass *make_pass_switch_pstate_sm (gcc::context *ctxt); rtl_opt_pass *make_pass_ldp_fusion (gcc::context *); @@ -1281,4 +1281,7 @@ extern bool aarch64_gcs_enabled (); extern unsigned aarch64_data_alignment (const_tree exp, unsigned align); extern unsigned aarch64_stack_alignment (const_tree exp, unsigned align); +extern rtx aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x, + rtx_code_label *label); + #endif /* GCC_AARCH64_PROTOS_H */ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 8b75c3d..c111dc2 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -6731,7 +6731,7 @@ (SAT_TRUNC:<VNARROWQ> (<TRUNC_SHIFT>:SD_HSDI (match_operand:SD_HSDI 1 "register_operand" "w") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD" "<shrn_op>shrn\t%<vn2>0<Vmntype>, %<v>1<Vmtype>, %2" [(set_attr "type" "neon_shift_imm_narrow_q")] @@ -6753,7 +6753,7 @@ (ALL_TRUNC:<VNARROWQ> (<TRUNC_SHIFT>:VQN (match_operand:VQN 1 "register_operand") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD" { operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode, @@ -6784,7 +6784,7 @@ (<TRUNCEXTEND>:<DWI> (match_operand:SD_HSDI 1 "register_operand" "w")) (match_operand:<DWI> 3 "aarch64_int_rnd_operand")) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD && aarch64_const_vec_rnd_cst_p (operands[3], operands[2])" "<shrn_op>rshrn\t%<vn2>0<Vmntype>, %<v>1<Vmtype>, %2" @@ -6799,7 +6799,7 @@ (<TRUNCEXTEND>:<V2XWIDE> (match_operand:SD_HSDI 1 "register_operand")) (match_dup 
3)) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD" { /* Use this expander to create the rounding constant vector, which is @@ -6819,7 +6819,7 @@ (<TRUNCEXTEND>:<V2XWIDE> (match_operand:VQN 1 "register_operand")) (match_dup 3)) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD" { if (<CODE> == TRUNCATE @@ -6861,7 +6861,7 @@ (smax:SD_HSDI (ashiftrt:SD_HSDI (match_operand:SD_HSDI 1 "register_operand" "w") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")) + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")) (const_int 0)) (const_int <half_mask>)))] "TARGET_SIMD" @@ -6872,7 +6872,7 @@ (define_expand "aarch64_sqshrun_n<mode>" [(match_operand:<VNARROWQ> 0 "register_operand") (match_operand:SD_HSDI 1 "register_operand") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")] "TARGET_SIMD" { rtx dst = gen_reg_rtx (<MODE>mode); @@ -6890,7 +6890,7 @@ (smax:VQN (ashiftrt:VQN (match_operand:VQN 1 "register_operand") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")) + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")) (match_dup 3)) (match_dup 4))))] "TARGET_SIMD" @@ -6932,7 +6932,7 @@ (sign_extend:<DWI> (match_operand:SD_HSDI 1 "register_operand" "w")) (match_operand:<DWI> 3 "aarch64_int_rnd_operand")) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")) + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")) (const_int 0)) (const_int <half_mask>)))] "TARGET_SIMD @@ -6944,7 +6944,7 @@ (define_expand "aarch64_sqrshrun_n<mode>" [(match_operand:<VNARROWQ> 0 "register_operand") (match_operand:SD_HSDI 1 "register_operand") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")] "TARGET_SIMD" { int prec = GET_MODE_UNIT_PRECISION (<DWI>mode); @@ -6967,7 +6967,7 @@ (sign_extend:<V2XWIDE> (match_operand:VQN 1 "register_operand")) (match_dup 3)) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")) + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")) (match_dup 4)) (match_dup 5))))] "TARGET_SIMD" diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md index 6b1a747..0123ea0 100644 --- a/gcc/config/aarch64/aarch64-sme.md +++ b/gcc/config/aarch64/aarch64-sme.md @@ -400,7 +400,8 @@ auto label = gen_label_rtx (); auto tpidr2 = gen_rtx_REG (DImode, R16_REGNUM); emit_insn (gen_aarch64_read_tpidr2 (tpidr2)); - auto jump = emit_likely_jump_insn (gen_aarch64_cbznedi1 (tpidr2, label)); + auto pat = aarch64_gen_compare_zero_and_branch (NE, tpidr2, label); + auto jump = emit_likely_jump_insn (pat); JUMP_LABEL (jump) = label; aarch64_restore_za (operands[0]); diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 2dbaf4a..ef9c165 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -975,19 +975,24 @@ aarch64_cb_rhs (rtx_code op_code, rtx rhs) { case EQ: case NE: - case GT: - case GTU: case LT: case LTU: + case GE: + case GEU: + /* EQ/NE range is 0 .. 63. + LT/LTU range is 0 .. 63. + GE/GEU range is 1 .. 64 => GT x - 1, but also supports 0 via XZR. + So the intersection is 0 .. 63. 
*/ return IN_RANGE (rhs_val, 0, 63); - case GE: /* CBGE: signed greater than or equal */ - case GEU: /* CBHS: unsigned greater than or equal */ - return IN_RANGE (rhs_val, 1, 64); - - case LE: /* CBLE: signed less than or equal */ - case LEU: /* CBLS: unsigned less than or equal */ - return IN_RANGE (rhs_val, -1, 62); + case GT: + case GTU: + case LE: + case LEU: + /* GT/GTU range is 0 .. 63 + LE/LEU range is -1 .. 62 => LT x + 1. + So the intersection is 0 .. 62. */ + return IN_RANGE (rhs_val, 0, 62); default: return false; @@ -2882,10 +2887,47 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y, return aarch64_gen_compare_reg (code, x, y); } +/* Split IMM into two 12-bit halves, producing an EQ/NE comparison vs X. + TMP may be a scratch. This optimizes a sequence from + mov x0, #imm1 + movk x0, #imm2, lsl 16 -- x0 contains CST + cmp x1, x0 + into the shorter: + sub tmp, x1, #(CST & 0xfff000) + subs tmp, tmp, #(CST & 0x000fff) +*/ +rtx +aarch64_gen_compare_split_imm24 (rtx x, rtx imm, rtx tmp) +{ + HOST_WIDE_INT lo_imm = UINTVAL (imm) & 0xfff; + HOST_WIDE_INT hi_imm = UINTVAL (imm) & 0xfff000; + enum machine_mode mode = GET_MODE (x); + + if (GET_CODE (tmp) == SCRATCH) + tmp = gen_reg_rtx (mode); + + emit_insn (gen_add3_insn (tmp, x, GEN_INT (-hi_imm))); + /* TODO: We don't need the gpr result of the second insn. */ + switch (mode) + { + case SImode: + tmp = gen_addsi3_compare0 (tmp, tmp, GEN_INT (-lo_imm)); + break; + case DImode: + tmp = gen_adddi3_compare0 (tmp, tmp, GEN_INT (-lo_imm)); + break; + default: + abort (); + } + emit_insn (tmp); + + return gen_rtx_REG (CC_NZmode, CC_REGNUM); +} + /* Generate conditional branch to LABEL, comparing X to 0 using CODE. Return the jump instruction. */ -static rtx +rtx aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x, rtx_code_label *label) { @@ -14380,41 +14422,57 @@ aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed) if (GET_CODE (op1) == PC || GET_CODE (op2) == PC) { /* Conditional branch. */ - if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC) + enum machine_mode cmpmode = GET_MODE (inner); + if (GET_MODE_CLASS (cmpmode) == MODE_CC) return true; - else + + if (comparator == const0_rtx) { - if (cmpcode == NE || cmpcode == EQ) + switch (cmpcode) { - if (comparator == const0_rtx) - { - /* TBZ/TBNZ/CBZ/CBNZ. */ - if (GET_CODE (inner) == ZERO_EXTRACT) - /* TBZ/TBNZ. */ - *cost += rtx_cost (XEXP (inner, 0), VOIDmode, - ZERO_EXTRACT, 0, speed); - else - /* CBZ/CBNZ. */ - *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed); - - return true; - } - if (register_operand (inner, VOIDmode) - && aarch64_imm24 (comparator, VOIDmode)) + case NE: + case EQ: + if (cmpmode != SImode && cmpmode != DImode) + break; + if (GET_CODE (inner) == ZERO_EXTRACT) { - /* SUB and SUBS. */ - *cost += COSTS_N_INSNS (2); - if (speed) - *cost += extra_cost->alu.arith * 2; + /* TBZ/TBNZ. */ + *cost += rtx_cost (XEXP (inner, 0), VOIDmode, + ZERO_EXTRACT, 0, speed); return true; } + /* FALLTHRU */ + + case LT: + case GE: + /* CBZ/CBNZ/TBZ/TBNZ. */ + *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed); + return true; + + default: + break; } - else if (cmpcode == LT || cmpcode == GE) - { - /* TBZ/TBNZ. */ - if (comparator == const0_rtx) - return true; - } + } + + if ((cmpcode == NE || cmpcode == EQ) + && (cmpmode == SImode || cmpmode == DImode) + && aarch64_split_imm24 (comparator, cmpmode)) + { + /* SUB and SUBS. 
*/ + *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed); + *cost += COSTS_N_INSNS (2); + if (speed) + *cost += extra_cost->alu.arith * 2; + return true; + } + + if (TARGET_CMPBR) + { + *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed); + if ((cmpmode != SImode && cmpmode != DImode) + || !aarch64_cb_rhs (cmpcode, comparator)) + *cost += rtx_cost (comparator, cmpmode, cmpcode, 1, speed); + return true; } } else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC) @@ -16999,6 +17057,14 @@ private: or vector loop. There is one entry for each tuning option of interest. */ auto_vec<aarch64_vec_op_count, 2> m_ops; + + /* When doing inner-loop vectorization the constraints on the data-refs in the + outer-loop could limit the inner loop references. i.e. the outerloop can + force the inner-loop to do a load and splat which will result in the loop + being entirely scalar as all lanes work on a duplicate. Currently we don't + support unrolling of the inner loop independently from the outerloop during + outer-loop vectorization which tends to lead to pipeline bubbles. */ + bool m_loop_fully_scalar_dup = false; }; aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo, @@ -17320,13 +17386,14 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, static bool aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info, - unsigned int vec_flags) + slp_tree node, unsigned int vec_flags) { gassign *assign = dyn_cast<gassign *> (stmt_info->stmt); if (!assign + || !node || gimple_assign_rhs_code (assign) != BIT_AND_EXPR - || !STMT_VINFO_VECTYPE (stmt_info) - || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info))) + || !SLP_TREE_VECTYPE (node) + || !VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))) return false; for (int i = 1; i < 3; ++i) @@ -17361,10 +17428,11 @@ aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info, instructions. */ static unsigned int aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, + slp_tree node, stmt_vec_info stmt_info, const sve_vec_cost *sve_costs) { - switch (vect_reduc_type (vinfo, stmt_info)) + switch (vect_reduc_type (vinfo, node)) { case EXTRACT_LAST_REDUCTION: return sve_costs->clast_cost; @@ -17404,7 +17472,9 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the SVE implementation. */ static unsigned int -aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, +aarch64_in_loop_reduction_latency (vec_info *vinfo, + slp_tree node, + stmt_vec_info stmt_info, unsigned int vec_flags) { const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs; @@ -17417,7 +17487,8 @@ aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, if (sve_costs) { unsigned int latency - = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs); + = aarch64_sve_in_loop_reduction_latency (vinfo, node, + stmt_info, sve_costs); if (latency) return latency; } @@ -17493,7 +17564,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, if (kind == scalar_load && node && sve_costs - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) { unsigned int nunits = vect_nunits_for_cost (vectype); /* Test for VNx2 modes, which have 64-bit containers. 
*/ @@ -17507,7 +17578,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, if (kind == scalar_store && node && sve_costs - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) return sve_costs->scatter_store_elt_cost; /* Detect cases in which vec_to_scalar represents an in-loop reduction. */ @@ -17516,7 +17587,8 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, && sve_costs) { unsigned int latency - = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs); + = aarch64_sve_in_loop_reduction_latency (vinfo, node, + stmt_info, sve_costs); if (latency) return latency; } @@ -17665,7 +17737,7 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, /* For vector boolean ANDs with a compare operand we just need one insn. */ - if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags)) + if (aarch64_bool_compound_p (vinfo, stmt_info, node, vec_flags)) return 0; } @@ -17698,13 +17770,12 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, with the single accumulator being read and written multiple times. */ static bool -aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info) +aarch64_force_single_cycle (vec_info *vinfo, slp_tree node) { - if (!STMT_VINFO_REDUC_DEF (stmt_info)) + auto reduc_info = info_for_reduction (as_a <loop_vec_info> (vinfo), node); + if (!reduc_info) return false; - - auto reduc_info = info_for_reduction (vinfo, stmt_info); - return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); + return VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info); } /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost @@ -17728,8 +17799,10 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, && vect_is_reduction (stmt_info)) { unsigned int base - = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags); - if (aarch64_force_single_cycle (m_vinfo, stmt_info)) + = aarch64_in_loop_reduction_latency (m_vinfo, node, + stmt_info, m_vec_flags); + if (m_costing_for_scalar + || aarch64_force_single_cycle (m_vinfo, node)) /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector, and then accumulate that, but at the moment the loop-carried dependency includes all copies. */ @@ -17746,7 +17819,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, /* Assume that bool AND with compare operands will become a single operation. */ - if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags)) + if (aarch64_bool_compound_p (m_vinfo, stmt_info, node, m_vec_flags)) return; } @@ -17763,7 +17836,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && kind == vec_to_scalar && (m_vec_flags & VEC_ADVSIMD) - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) { auto dr = STMT_VINFO_DATA_REF (stmt_info); tree dr_ref = DR_REF (dr); @@ -17842,7 +17915,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, have only accounted for one. */ if (stmt_info && (kind == vector_stmt || kind == vec_to_scalar) - && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION) + && vect_reduc_type (m_vinfo, node) == COND_REDUCTION) ops->general_ops += count; /* Count the predicate operations needed by an SVE comparison. 
*/ @@ -17878,7 +17951,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && sve_issue && (kind == scalar_load || kind == scalar_store) - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) { unsigned int pairs = CEIL (count, 2); ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs; @@ -17987,6 +18060,17 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, tree vectype, int misalign, vect_cost_model_location where) { + /* When costing for scalars, vectype will be NULL; so look up the type via + stmt_info's statement. */ + if (m_costing_for_scalar && stmt_info) + { + gcc_assert (!vectype); + /* This won't work for e.g. gconds or other statements without a lhs, + but those only work on GPR anyway and this is the best we can do. */ + if (tree lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info))) + vectype = TREE_TYPE (lhs); + } + fractional_cost stmt_cost = aarch64_builtin_vectorization_cost (kind, vectype, misalign); @@ -18002,6 +18086,28 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, analyze_loop_vinfo (loop_vinfo); m_analyzed_vinfo = true; + if (in_inner_loop_p) + m_loop_fully_scalar_dup = true; + } + + /* Detect whether the loop is working on fully duplicated lanes. This would + only be possible with inner loop vectorization since otherwise we wouldn't + try to vectorize. */ + if (in_inner_loop_p + && node + && m_loop_fully_scalar_dup + && SLP_TREE_LANES (node) == 1 + && !SLP_TREE_CHILDREN (node).exists ()) + { + /* Check if load is a duplicate. */ + if (gimple_vuse (stmt_info->stmt) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_INVARIANT) + ; + else if (SLP_TREE_DEF_TYPE (node) == vect_constant_def + || SLP_TREE_DEF_TYPE (node) == vect_external_def) + ; + else + m_loop_fully_scalar_dup = false; } /* Apply the heuristic described above m_stp_sequence_cost. */ @@ -18036,7 +18142,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && node && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)) - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) { const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve; if (sve_costs) @@ -18368,8 +18474,19 @@ adjust_body_cost (loop_vec_info loop_vinfo, if (m_vec_flags & VEC_ANY_SVE) threshold = CEIL (threshold, aarch64_estimated_sve_vq ()); - if (m_num_vector_iterations >= 1 - && m_num_vector_iterations < threshold) + /* Increase the cost of the vector code if it looks like the vector code has + limited throughput due to outer-loop vectorization. */ + if (m_loop_fully_scalar_dup) + { + body_cost *= estimated_vf; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because vector code has" + " low throughput of per iteration due to splats\n", + body_cost); + } + else if (m_num_vector_iterations >= 1 + && m_num_vector_iterations < threshold) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -31808,7 +31925,7 @@ aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode, /* Expand the spaceship optab for floating-point operands. - If the result is compared against (-1, 0, 1 , 2), expand into + If the result is compared against (-1, 0, 1, -128), expand into fcmpe + conditional branch insns. 
Otherwise (the result is just stored as an integer), expand into @@ -31847,7 +31964,7 @@ aarch64_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx hint) emit_jump (end_label); emit_label (un_label); - emit_move_insn (dest, const2_rtx); + emit_move_insn (dest, GEN_INT (-128)); emit_jump (end_label); emit_label (gt_label); diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 096c853..2b3610c 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -410,8 +410,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED /* CSSC instructions are enabled through +cssc. */ #define TARGET_CSSC AARCH64_HAVE_ISA (CSSC) -/* CB<cc> instructions are enabled through +cmpbr. */ -#define TARGET_CMPBR AARCH64_HAVE_ISA (CMPBR) +/* CB<cc> instructions are enabled through +cmpbr, + but are incompatible with -mtrack-speculation. */ +#define TARGET_CMPBR (AARCH64_HAVE_ISA (CMPBR) && !aarch64_track_speculation) /* Make sure this is always defined so we don't have to check for ifdefs but rather use normal ifs. */ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index dc2be81..6e215c4 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -441,6 +441,16 @@ ; must not operate on inactive inputs if doing so could induce a fault. (SVE_STRICT_GP 1)]) +;; These constants are used as a const_int in MTE instructions +(define_constants + [; 0xf0ff... + ; Tag mask for the 4-bit tag stored in the top 8 bits of a pointer. + (MEMTAG_TAG_MASK -1080863910568919041) + + ; 0x00ff... + ; Tag mask 56-bit address used by subp instruction. + (MEMTAG_ADDR_MASK 72057594037927935)]) + (include "constraints.md") (include "predicates.md") (include "iterators.md") @@ -725,8 +735,8 @@ (BRANCH_LEN_N_32KiB -32768) ;; +/- 1KiB. Used by CBB<cond>, CBH<cond>, CB<cond>. - (BRANCH_LEN_P_1Kib 1020) - (BRANCH_LEN_N_1Kib -1024) + (BRANCH_LEN_P_1KiB 1020) + (BRANCH_LEN_N_1KiB -1024) ] ) @@ -804,7 +814,7 @@ ) ;; For an EQ/NE comparison against zero, emit `CBZ`/`CBNZ` -(define_insn "aarch64_cbz<optab><mode>1" +(define_insn "*aarch64_cbz<optab><mode>" [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r") (const_int 0)) (label_ref (match_operand 1)) @@ -838,27 +848,13 @@ [(set (pc) (if_then_else (LTGE (match_operand:ALLI 0 "register_operand" "r") (const_int 0)) (label_ref (match_operand 1)) - (pc))) - (clobber (reg:CC CC_REGNUM))] + (pc)))] "!aarch64_track_speculation" { - if (get_attr_length (insn) == 8) - { - if (get_attr_far_branch (insn) == FAR_BRANCH_YES) - return aarch64_gen_far_branch (operands, 1, "Ltb", - "<inv_tb>\\t%<w>0, <sizem1>, "); - else - { - char buf[64]; - uint64_t val = ((uint64_t) 1) - << (GET_MODE_SIZE (<MODE>mode) * BITS_PER_UNIT - 1); - sprintf (buf, "tst\t%%<w>0, %" PRId64, val); - output_asm_insn (buf, operands); - return "<bcond>\t%l1"; - } - } - else + if (get_attr_length (insn) == 4) return "<tbz>\t%<w>0, <sizem1>, %l1"; + return aarch64_gen_far_branch (operands, 1, "Ltb", + "<inv_tb>\\t%<w>0, <sizem1>, "); } [(set_attr "type" "branch") (set (attr "length") @@ -870,44 +866,44 @@ (const_int 8))) (set (attr "far_branch") (if_then_else (and (ge (minus (match_dup 1) (pc)) - (const_int BRANCH_LEN_N_1MiB)) + (const_int BRANCH_LEN_N_32KiB)) (lt (minus (match_dup 1) (pc)) - (const_int BRANCH_LEN_P_1MiB))) + (const_int BRANCH_LEN_P_32KiB))) (const_string "no") (const_string "yes")))] ) ;; Emit a `CB<cond> (register)` or `CB<cond> (immediate)` instruction. 
;; The immediate range depends on the comparison code. -;; Comparisons against immediates outside this range fall back to -;; CMP + B<cond>. -(define_insn "aarch64_cb<INT_CMP:code><GPI:mode>" - [(set (pc) (if_then_else (INT_CMP - (match_operand:GPI 0 "register_operand" "r") - (match_operand:GPI 1 "nonmemory_operand" - "r<INT_CMP:cmpbr_imm_constraint>")) - (label_ref (match_operand 2)) - (pc)))] - "TARGET_CMPBR && aarch64_cb_rhs (<INT_CMP:CODE>, operands[1])" +(define_insn "*aarch64_cb<code><mode>" + [(set (pc) (if_then_else + (INT_CMP + (match_operand:GPI 0 "register_operand" "r") + (match_operand:GPI 1 + "aarch64_reg_<cmpbr_imm_constraint>_operand" + "r<cmpbr_imm_constraint>")) + (label_ref (match_operand 2)) + (pc)))] + "TARGET_CMPBR" { - return (get_attr_far_branch (insn) == FAR_BRANCH_NO) - ? "cb<INT_CMP:cmp_op>\\t%<w>0, %<w>1, %l2" - : aarch64_gen_far_branch (operands, 2, "L", - "cb<INT_CMP:inv_cmp_op>\\t%<w>0, %<w>1, "); + if (get_attr_length (insn) == 4) + return "cb<cmp_op>\t%<w>0, %<w>1, %l2"; + return aarch64_gen_far_branch (operands, 2, "L", + "cb<inv_cmp_op>\t%<w>0, %<w>1, "); } [(set_attr "type" "branch") (set (attr "length") (if_then_else (and (ge (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_N_1Kib)) + (const_int BRANCH_LEN_N_1KiB)) (lt (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_P_1Kib))) + (const_int BRANCH_LEN_P_1KiB))) (const_int 4) (const_int 8))) (set (attr "far_branch") (if_then_else (and (ge (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_N_1Kib)) + (const_int BRANCH_LEN_N_1KiB)) (lt (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_P_1Kib))) + (const_int BRANCH_LEN_P_1KiB))) (const_string "no") (const_string "yes")))] ) @@ -929,16 +925,16 @@ [(set_attr "type" "branch") (set (attr "length") (if_then_else (and (ge (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_N_1Kib)) + (const_int BRANCH_LEN_N_1KiB)) (lt (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_P_1Kib))) + (const_int BRANCH_LEN_P_1KiB))) (const_int 4) (const_int 8))) (set (attr "far_branch") (if_then_else (and (ge (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_N_1Kib)) + (const_int BRANCH_LEN_N_1KiB)) (lt (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_P_1Kib))) + (const_int BRANCH_LEN_P_1KiB))) (const_string "no") (const_string "yes")))] ) @@ -978,37 +974,24 @@ (const_string "yes")))] ) -;; For a 24-bit immediate CST we can optimize the compare for equality -;; and branch sequence from: -;; mov x0, #imm1 -;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ -;; cmp x1, x0 -;; b<ne,eq> .Label -;; into the shorter: -;; sub x0, x1, #(CST & 0xfff000) -;; subs x0, x0, #(CST & 0x000fff) -;; b<ne,eq> .Label +;; For a 24-bit immediate CST we can optimize the compare for equality. 
(define_insn_and_split "*aarch64_bcond_wide_imm<GPI:mode>" - [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r") - (match_operand:GPI 1 "aarch64_imm24" "n")) - (label_ref:P (match_operand 2)) - (pc)))] - "!aarch64_move_imm (INTVAL (operands[1]), <GPI:MODE>mode) - && !aarch64_plus_operand (operands[1], <GPI:MODE>mode) - && !reload_completed" + [(set (pc) (if_then_else + (match_operator 0 "aarch64_equality_operator" + [(match_operand:GPI 1 "register_operand" "r") + (match_operand:GPI 2 "aarch64_split_imm24" "n")]) + (label_ref (match_operand 3)) + (pc))) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:GPI 4 "=r"))] + "" "#" - "&& true" + "" [(const_int 0)] { - HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff; - HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000; - rtx tmp = gen_reg_rtx (<GPI:MODE>mode); - emit_insn (gen_add<GPI:mode>3 (tmp, operands[0], GEN_INT (-hi_imm))); - emit_insn (gen_add<GPI:mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); - rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); - rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <GPI:MODE>mode, - cc_reg, const0_rtx); - emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[2])); + rtx cc_reg = aarch64_gen_compare_split_imm24 (operands[1], operands[2], + operands[4]); + emit_jump_insn (gen_aarch64_bcond (operands[0], cc_reg, operands[3])); DONE; } ) @@ -1413,16 +1396,16 @@ /* Save GCS with code like mov x16, 1 chkfeat x16 - tbnz x16, 0, .L_done + cbnz x16, .L_done mrs tmp, gcspr_el0 str tmp, [%0, 8] .L_done: */ - rtx done_label = gen_label_rtx (); + auto done_label = gen_label_rtx (); rtx r16 = gen_rtx_REG (DImode, R16_REGNUM); emit_move_insn (r16, const1_rtx); emit_insn (gen_aarch64_chkfeat ()); - emit_insn (gen_tbranch_neqi3 (r16, const0_rtx, done_label)); + emit_jump_insn (aarch64_gen_compare_zero_and_branch (NE, r16, done_label)); rtx gcs_slot = adjust_address (operands[0], Pmode, GET_MODE_SIZE (Pmode)); rtx gcs = gen_reg_rtx (Pmode); emit_insn (gen_aarch64_load_gcspr (gcs)); @@ -1445,7 +1428,7 @@ /* Restore GCS with code like mov x16, 1 chkfeat x16 - tbnz x16, 0, .L_done + cbnz x16, .L_done ldr tmp1, [%1, 8] mrs tmp2, gcspr_el0 subs tmp2, tmp1, tmp2 @@ -1456,12 +1439,12 @@ b.ne .L_loop .L_done: */ - rtx loop_label = gen_label_rtx (); - rtx done_label = gen_label_rtx (); + auto loop_label = gen_label_rtx (); + auto done_label = gen_label_rtx (); rtx r16 = gen_rtx_REG (DImode, R16_REGNUM); emit_move_insn (r16, const1_rtx); emit_insn (gen_aarch64_chkfeat ()); - emit_insn (gen_tbranch_neqi3 (r16, const0_rtx, done_label)); + emit_jump_insn (aarch64_gen_compare_zero_and_branch (NE, r16, done_label)); rtx gcs_slot = adjust_address (operands[1], Pmode, GET_MODE_SIZE (Pmode)); rtx gcs_old = gen_reg_rtx (Pmode); emit_move_insn (gcs_old, gcs_slot); @@ -4524,7 +4507,7 @@ [(set_attr "type" "fcmp<stype>")] ) -(define_insn "*cmp_swp_<shift>_reg<mode>" +(define_insn "cmp_swp_<shift>_reg<mode>" [(set (reg:CC_SWP CC_REGNUM) (compare:CC_SWP (ASHIFT:GPI (match_operand:GPI 0 "register_operand" "r") @@ -4651,39 +4634,24 @@ [(set_attr "type" "csel")] ) -;; For a 24-bit immediate CST we can optimize the compare for equality -;; and branch sequence from: -;; mov x0, #imm1 -;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ -;; cmp x1, x0 -;; cset x2, <ne,eq> -;; into the shorter: -;; sub x0, x1, #(CST & 0xfff000) -;; subs x0, x0, #(CST & 0x000fff) -;; cset x2, <ne, eq>. +;; For a 24-bit immediate CST we can optimize the compare for equality. 
(define_insn_and_split "*compare_cstore<mode>_insn" [(set (match_operand:GPI 0 "register_operand" "=r") - (EQL:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand:GPI 2 "aarch64_imm24" "n"))) - (clobber (reg:CC CC_REGNUM))] - "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode) - && !aarch64_plus_operand (operands[2], <MODE>mode) - && !reload_completed" + (match_operator:GPI 1 "aarch64_equality_operator" + [(match_operand:GPI 2 "register_operand" "r") + (match_operand:GPI 3 "aarch64_split_imm24" "n")])) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:GPI 4 "=r"))] + "" "#" - "&& true" + "" [(const_int 0)] { - HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff; - HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000; - rtx tmp = gen_reg_rtx (<MODE>mode); - emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm))); - emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); - rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); - rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx); - emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg)); + rtx cc_reg = aarch64_gen_compare_split_imm24 (operands[2], operands[3], + operands[4]); + emit_insn (gen_aarch64_cstore<mode> (operands[0], operands[1], cc_reg)); DONE; } - [(set_attr "type" "csel")] ) ;; zero_extend version of the above @@ -4813,15 +4781,21 @@ (match_operand:ALLI 3 "register_operand")))] "" { - rtx ccreg; enum rtx_code code = GET_CODE (operands[1]); - if (code == UNEQ || code == LTGT) FAIL; - ccreg = aarch64_gen_compare_reg (code, XEXP (operands[1], 0), - XEXP (operands[1], 1)); - operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + rtx ccreg = XEXP (operands[1], 0); + enum machine_mode ccmode = GET_MODE (ccreg); + if (GET_MODE_CLASS (ccmode) == MODE_CC) + gcc_assert (XEXP (operands[1], 1) == const0_rtx); + else if (ccmode == QImode || ccmode == HImode) + FAIL; + else + { + ccreg = aarch64_gen_compare_reg (code, ccreg, XEXP (operands[1], 1)); + operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + } } ) @@ -7716,6 +7690,22 @@ } ) +(define_expand "isinf<mode>2" + [(match_operand:SI 0 "register_operand") + (match_operand:GPF 1 "register_operand")] + "TARGET_FLOAT" +{ + rtx op = force_lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode); + rtx tmp = gen_reg_rtx (<V_INT_EQUIV>mode); + emit_move_insn (tmp, GEN_INT (HOST_WIDE_INT_M1U << (<mantissa_bits> + 1))); + rtx cc_reg = gen_rtx_REG (CC_SWPmode, CC_REGNUM); + emit_insn (gen_cmp_swp_lsl_reg<v_int_equiv> (op, GEN_INT (1), tmp)); + rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, const0_rtx); + emit_insn (gen_aarch64_cstoresi (operands[0], cmp, cc_reg)); + DONE; +} +) + ;; ------------------------------------------------------------------- ;; Reload support ;; ------------------------------------------------------------------- @@ -8566,7 +8556,7 @@ [(set (match_operand:DI 0 "register_operand" "=rk") (ior:DI (and:DI (match_operand:DI 1 "register_operand" "rk") - (const_int -1080863910568919041)) ;; 0xf0ff... + (const_int MEMTAG_TAG_MASK)) (ashift:DI (unspec:QI [(match_operand:DI 2 "register_operand" "r")] UNSPEC_GEN_TAG_RND) (const_int 56))))] @@ -8609,9 +8599,9 @@ [(set (match_operand:DI 0 "register_operand" "=r") (minus:DI (and:DI (match_operand:DI 1 "register_operand" "rk") - (const_int 72057594037927935)) ;; 0x00ff... + (const_int MEMTAG_ADDR_MASK)) (and:DI (match_operand:DI 2 "register_operand" "rk") - (const_int 72057594037927935))))] ;; 0x00ff... 
+ (const_int MEMTAG_ADDR_MASK))))] "TARGET_MEMTAG" "subp\\t%0, %1, %2" [(set_attr "type" "memtag")] @@ -8621,7 +8611,7 @@ (define_insn "ldg" [(set (match_operand:DI 0 "register_operand" "+r") (ior:DI - (and:DI (match_dup 0) (const_int -1080863910568919041)) ;; 0xf0ff... + (and:DI (match_dup 0) (const_int MEMTAG_TAG_MASK)) (ashift:DI (mem:QI (unspec:DI [(and:DI (plus:DI (match_operand:DI 1 "register_operand" "rk") diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index dc1925d..7b9e558 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -312,15 +312,9 @@ (define_constraint "Uc1" "@internal - A constraint that matches the integers 1...64." + A constraint that matches the integers 0...62." (and (match_code "const_int") - (match_test "IN_RANGE (ival, 1, 64)"))) - -(define_constraint "Uc2" - "@internal - A constraint that matches the integers -1...62." - (and (match_code "const_int") - (match_test "IN_RANGE (ival, -1, 62)"))) + (match_test "IN_RANGE (ival, 0, 62)"))) (define_constraint "Up3" "@internal diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 68b080d..7a6ea0d 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1340,6 +1340,8 @@ (define_mode_attr half_mask [(HI "255") (SI "65535") (DI "4294967295")]) +(define_mode_attr mantissa_bits [(SF "23") (DF "52")]) + ;; For constraints used in scalar immediate vector moves (define_mode_attr hq [(HI "h") (QI "q")]) @@ -2203,7 +2205,8 @@ (SI "si")]) ;; Like ve_mode but for the half-width modes. -(define_mode_attr vn_mode [(V8HI "qi") (V4SI "hi") (V2DI "si")]) +(define_mode_attr vn_mode [(V8HI "qi") (V4SI "hi") (V2DI "si") (DI "si") + (SI "hi") (HI "qi")]) ;; Vm for lane instructions is restricted to FP_LO_REGS. (define_mode_attr vwx [(V4HI "x") (V8HI "x") (HI "x") @@ -2986,19 +2989,15 @@ (define_code_iterator INT_CMP [lt le eq ne ge gt ltu leu geu gtu]) +;; Inverse comparisons must have the same constraint so that +;; branches can be redirected during late compilation. (define_code_attr cmpbr_imm_constraint [ - (eq "Uc0") - (ne "Uc0") - (gt "Uc0") - (gtu "Uc0") - (lt "Uc0") - (ltu "Uc0") - - (ge "Uc1") - (geu "Uc1") - - (le "Uc2") - (leu "Uc2") + (eq "Uc0") (ne "Uc0") + (lt "Uc0") (ge "Uc0") + (ltu "Uc0") (geu "Uc0") + + (gt "Uc1") (le "Uc1") + (gtu "Uc1") (leu "Uc1") ]) (define_code_attr fix_trunc_optab [(fix "fix_trunc") diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index 4d5d57f..42304ce 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -286,10 +286,15 @@ (and (match_code "const_int") (match_test "UINTVAL (op) <= 7"))) -;; An immediate that fits into 24 bits. -(define_predicate "aarch64_imm24" - (and (match_code "const_int") - (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)"))) +;; An immediate that fits into 24 bits, but needs splitting. 
+(define_predicate "aarch64_split_imm24" + (match_code "const_int") +{ + unsigned HOST_WIDE_INT i = UINTVAL (op); + return (IN_RANGE (i, 0, 0xffffff) + && !aarch64_move_imm (i, mode) + && !aarch64_uimm12_shift (i)); +}) (define_predicate "aarch64_mem_pair_offset" (and (match_code "const_int") @@ -1084,3 +1089,13 @@ (define_special_predicate "aarch64_ptrue_all_operand" (and (match_code "const_vector") (match_test "aarch64_ptrue_all_mode (op) == mode"))) + +(define_predicate "aarch64_reg_Uc0_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "const_int") + (match_test "satisfies_constraint_Uc0 (op)")))) + +(define_predicate "aarch64_reg_Uc1_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "const_int") + (match_test "satisfies_constraint_Uc1 (op)")))) diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index 38a8c06..63ca8e9 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -190,12 +190,6 @@ aarch-bti-insert.o: $(srcdir)/config/arm/aarch-bti-insert.cc \ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/arm/aarch-bti-insert.cc -aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \ - $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ - $(RTL_SSA_H) tree-pass.h - $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ - $(srcdir)/config/aarch64/aarch64-cc-fusion.cc - aarch64-early-ra.o: $(srcdir)/config/aarch64/aarch64-early-ra.cc \ $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ $(RTL_SSA_H) tree-pass.h diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index d119464..8f7e537 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -66,9 +66,9 @@ ;; I signed 12-bit immediate (for ARCompact) ;; K unsigned 3-bit immediate (for ARCompact) ;; L unsigned 6-bit immediate (for ARCompact) -;; M unsinged 5-bit immediate (for ARCompact) -;; O unsinged 7-bit immediate (for ARCompact) -;; P unsinged 8-bit immediate (for ARCompact) +;; M unsigned 5-bit immediate (for ARCompact) +;; O unsigned 7-bit immediate (for ARCompact) +;; P unsigned 8-bit immediate (for ARCompact) ;; N constant '1' (for ARCompact) diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 537a3e2..422ae54 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -13026,7 +13026,7 @@ "arm_coproc_builtin_available (VUNSPEC_<MCRR>)" { arm_const_bounds (operands[0], 0, 16); - arm_const_bounds (operands[1], 0, 8); + arm_const_bounds (operands[1], 0, 16); arm_const_bounds (operands[3], 0, (1 << 5)); return "<mcrr>\\tp%c0, %1, %Q2, %R2, CR%c3"; } @@ -13041,7 +13041,7 @@ "arm_coproc_builtin_available (VUNSPEC_<MRRC>)" { arm_const_bounds (operands[1], 0, 16); - arm_const_bounds (operands[2], 0, 8); + arm_const_bounds (operands[2], 0, 16); arm_const_bounds (operands[3], 0, (1 << 5)); return "<mrrc>\\tp%c1, %2, %Q0, %R0, CR%c3"; } diff --git a/gcc/config/avr/specs.h b/gcc/config/avr/specs.h index ff269bf..c95c758 100644 --- a/gcc/config/avr/specs.h +++ b/gcc/config/avr/specs.h @@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see "%(asm_errata_skip) " #define LINK_RELAX_SPEC \ - "%{mrelax:--relax} " + "%{!r:%{mrelax:--relax}} " #undef LINK_SPEC #define LINK_SPEC \ diff --git a/gcc/config/cris/cris.h b/gcc/config/cris/cris.h index 1681c79..f356679 100644 --- a/gcc/config/cris/cris.h +++ b/gcc/config/cris/cris.h @@ -171,7 +171,7 @@ extern int cris_cpu_version; /* For the cris-*-elf subtarget. 
*/ #define CRIS_ASM_SUBTARGET_SPEC \ - "--em=criself %{!march=*:%{!mcpu=*:" CRIS_DEFAULT_ASM_ARCH_OPTION "}}" + "--emulation=criself %{!march=*:%{!mcpu=*:" CRIS_DEFAULT_ASM_ARCH_OPTION "}}" /* FIXME: We should propagate the -melf option to make the criself "emulation" unless a linker script is provided (-T*), but I don't know diff --git a/gcc/config/darwin-sections.def b/gcc/config/darwin-sections.def index 44adcc6..76587c2 100644 --- a/gcc/config/darwin-sections.def +++ b/gcc/config/darwin-sections.def @@ -215,3 +215,10 @@ DEF_SECTION (objc2_method_names_section, 0, DEF_SECTION (objc2_method_types_section, 0, ".section __TEXT, __objc_methtype, cstring_literals", 1) + +/* ASAN sections. */ + +DEF_SECTION (asan_string_section, 0, ".section __TEXT, __asan_cstring", 0) +DEF_SECTION (asan_globals_section, 0, ".section __DATA, __asan_globals", 0) +DEF_SECTION (asan_liveness_section, 0, + ".section __DATA,__asan_liveness,regular,live_support", 0) diff --git a/gcc/config/darwin.cc b/gcc/config/darwin.cc index be2daed..75ac356 100644 --- a/gcc/config/darwin.cc +++ b/gcc/config/darwin.cc @@ -49,6 +49,7 @@ along with GCC; see the file COPYING3. If not see #include "optabs.h" #include "flags.h" #include "opts.h" +#include "asan.h" /* Fix and Continue. @@ -1298,6 +1299,39 @@ darwin_encode_section_info (tree decl, rtx rtl, int first) SYMBOL_FLAG_EXTERNAL. */ default_encode_section_info (decl, rtl, first); + if (CONSTANT_CLASS_P (decl)) + { + bool is_str = TREE_CODE (decl) == STRING_CST; + rtx sym_ref = XEXP (rtl, 0); + + /* Unless this is a string cst or we are in an anchored section we have + nothing more to do here. */ + if (!is_str && !SYMBOL_REF_HAS_BLOCK_INFO_P (sym_ref)) + return; + + tree sym_decl = SYMBOL_REF_DECL (sym_ref); + const char *name = XSTR (sym_ref, 0); + gcc_checking_assert (strncmp ("*lC", name, 3) == 0); + + char *buf; + if (is_str) + { + bool for_asan = (flag_sanitize & SANITIZE_ADDRESS) + && asan_protect_global (CONST_CAST_TREE (decl)); + /* When we are generating code for sanitized strings, the string + internal symbols are made visible in the object. */ + buf = xasprintf ("*%c.str.%s", for_asan ? 'l' : 'L', &name[3]); + } + else + /* Lets identify anchored constants with a different prefix, for the + sake of inspection only. */ + buf = xasprintf ("*LaC%s", &name[3]); + if (sym_decl) + DECL_NAME (sym_decl) = get_identifier (buf); + XSTR (sym_ref, 0) = ggc_strdup (buf); + free (buf); + } + if (! VAR_OR_FUNCTION_DECL_P (decl)) return; @@ -1683,6 +1717,17 @@ machopic_select_section (tree decl, ro = TREE_READONLY (decl) || TREE_CONSTANT (decl) ; + /* Trump categorize_decl_for_section () for ASAN stuff - the Darwin + categorisations are special. 
*/ + if (flag_sanitize & SANITIZE_ADDRESS) + { + if (TREE_CODE (decl) == STRING_CST + && asan_protect_global (CONST_CAST_TREE (decl))) + { + return darwin_sections[asan_string_section]; + } + } + switch (categorize_decl_for_section (decl, reloc)) { case SECCAT_TEXT: @@ -1699,7 +1744,12 @@ machopic_select_section (tree decl, break; case SECCAT_RODATA_MERGE_STR_INIT: - base_section = darwin_mergeable_string_section (DECL_INITIAL (decl), align); + if ((flag_sanitize & SANITIZE_ADDRESS) + && asan_protect_global (CONST_CAST_TREE (decl))) + /* or !flag_merge_constants */ + return darwin_sections[asan_string_section]; + else + return darwin_mergeable_string_section (DECL_INITIAL (decl), align); break; case SECCAT_RODATA_MERGE_CONST: @@ -3297,11 +3347,16 @@ darwin_use_anchors_for_symbol_p (const_rtx symbol) { if (DARWIN_SECTION_ANCHORS && flag_section_anchors) { - section *sect; - /* If the section contains a zero-sized object it's ineligible. */ - sect = SYMBOL_REF_BLOCK (symbol)->sect; - /* This should have the effect of disabling anchors for vars that follow - any zero-sized one, in a given section. */ + tree decl = SYMBOL_REF_DECL (symbol); + /* If the symbol would be linker-visible, then it can split at that + so we must disallow. This is more strict than the default impl. + TODO: add other cases. */ + if (decl && DECL_P (decl) + && (TREE_PUBLIC (decl) || !DECL_ARTIFICIAL (decl))) + return false; + + /* We mark sections containing unsuitable entries. */ + section *sect = SYMBOL_REF_BLOCK (symbol)->sect; if (sect->common.flags & SECTION_NO_ANCHOR) return false; diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 9b9a3fe..c3e28e2 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -287,6 +287,19 @@ extern GTY(()) int darwin_ms_struct; #define DARWIN_RDYNAMIC "%{rdynamic:%nrdynamic is not supported}" #endif +#if LD64_HAS_NO_DEDUPLICATE +/* What we want is "when the optimization level is debug OR when it is + a compile & link job with implied O0 optimization". */ +#define DARWIN_LD_NO_DEDUPLICATE \ + "%{O0|O1|O|Og: -no_deduplicate} \ + %{!O*:\ + %{.c|.cc|.C|.cpp|.cp|.c++|.cxx|.CPP|.m|.mm|.s|.S|.i|.ii|.mi|.mii|\ + .f|.for|.ftn|.fpp|.f90|.f95|.f03|.f08|.f77|.F|.F90|.F95|.F03|.F08|\ + .d|.mod: -no_deduplicate }} " +#else +#define DARWIN_LD_NO_DEDUPLICATE "" +#endif + #if LD64_HAS_MACOS_VERSION_MIN # define DARWIN_PLATFORM_ID \ "%{mmacosx-version-min=*:-macos_version_min %*} " @@ -403,10 +416,14 @@ extern GTY(()) int darwin_ms_struct; %(linker)" \ DARWIN_LD_DEMANGLE \ LINK_PLUGIN_SPEC \ + DARWIN_LD_NO_DEDUPLICATE \ "%{flto*:%<fcompare-debug*} \ %{flto} %{fno-lto} %{flto=*} \ - %l " \ + %{static}%{!static:%{!dynamic:-dynamic}} \ + %{force_cpusubtype_ALL:-arch %(darwin_arch)} \ + %{!force_cpusubtype_ALL:-arch %(darwin_subarch)} "\ DARWIN_PLATFORM_ID \ + " %l " \ LINK_COMPRESS_DEBUG_SPEC \ "%X %{s} %{t} %{Z} %{u*} \ %{e*} %{r} \ @@ -493,9 +510,8 @@ extern GTY(()) int darwin_ms_struct; Note that options taking arguments may appear multiple times on a command line with different arguments each time, so put a * after their names so all of them get passed. 
*/ -#define LINK_SPEC \ - "%{static}%{!static:%{!dynamic:-dynamic}} \ - %:remove-outfile(-ldl) \ +#define LINK_SPEC \ + "%:remove-outfile(-ldl) \ %:remove-outfile(-lm) \ %:remove-outfile(-lpthread) \ %{fgnu-runtime: %{static|static-libgcc: \ @@ -511,9 +527,7 @@ extern GTY(()) int darwin_ms_struct; %{static|static-libgm2:%:replace-outfile(-lm2iso libm2iso.a%s)}\ %{static|static-libgm2:%:replace-outfile(-lm2min libm2min.a%s)}\ %{static|static-libgm2:%:replace-outfile(-lm2log libm2log.a%s)}\ - %{static|static-libgm2:%:replace-outfile(-lm2cor libm2cor.a%s)}\ - %{force_cpusubtype_ALL:-arch %(darwin_arch)} \ - %{!force_cpusubtype_ALL:-arch %(darwin_subarch)} "\ + %{static|static-libgm2:%:replace-outfile(-lm2cor libm2cor.a%s)} "\ LINK_SYSROOT_SPEC \ "%{!multiply_defined*:%{shared-libgcc: \ %:version-compare(< 10.5 mmacosx-version-min= -multiply_defined) \ @@ -1005,6 +1019,8 @@ extern GTY(()) section * darwin_sections[NUM_DARWIN_SECTIONS]; sprintf (LABEL, "*%s%ld", "lASAN", (long)(NUM));\ else if (strcmp ("LTRAMP", PREFIX) == 0) \ sprintf (LABEL, "*%s%ld", "lTRAMP", (long)(NUM));\ + else if (strncmp ("LANCHOR", PREFIX, 7) == 0) \ + sprintf (LABEL, "*%s%ld", "lANCHOR", (long)(NUM));\ else \ sprintf (LABEL, "*%s%ld", PREFIX, (long)(NUM)); \ } while (0) diff --git a/gcc/config/h8300/addsub.md b/gcc/config/h8300/addsub.md index 32eba9d..f153625 100644 --- a/gcc/config/h8300/addsub.md +++ b/gcc/config/h8300/addsub.md @@ -271,7 +271,7 @@ (match_operand:QHSI 2 "register_operand" "r")) (match_dup 1))) (set (match_operand:QHSI 0 "register_operand" "=r") - (plus (match_dup 1) (match_dup 2))) + (plus:QHSI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))] "" { diff --git a/gcc/config/h8300/jumpcall.md b/gcc/config/h8300/jumpcall.md index 4e63408..44847e4 100644 --- a/gcc/config/h8300/jumpcall.md +++ b/gcc/config/h8300/jumpcall.md @@ -156,7 +156,7 @@ "#" "&& reload_completed" [(set (reg:CCZ CC_REG) - (eq (zero_extract:HSI (match_dup 1) (const_int 1) (match_dup 2)) + (eq:CCZ (zero_extract:HSI (match_dup 1) (const_int 1) (match_dup 2)) (const_int 0))) (set (pc) (if_then_else (match_op_dup 3 [(reg:CCZ CC_REG) (const_int 0)]) @@ -181,7 +181,7 @@ (lshiftrt:SI (match_dup 1) (const_int 16)))) (clobber (reg:CC CC_REG))]) (set (reg:CCZ CC_REG) - (eq (zero_extract:SI (match_dup 4) (const_int 1) (match_dup 2)) + (eq:CCZ (zero_extract:SI (match_dup 4) (const_int 1) (match_dup 2)) (const_int 0))) (set (pc) (if_then_else (match_op_dup 3 [(reg:CCZ CC_REG) (const_int 0)]) @@ -288,7 +288,7 @@ }) (define_insn "call_insn_<mode>" - [(call (mem:QI (match_operand 0 "call_insn_operand" "Cr")) + [(call (mem:QI (match_operand:P 0 "call_insn_operand" "Cr")) (match_operand:P 1 "general_operand" "g"))] "!SIBLING_CALL_P (insn)" { @@ -326,7 +326,7 @@ (define_insn "call_value_insn_<mode>" [(set (match_operand 0 "" "=r") - (call (mem:QI (match_operand 1 "call_insn_operand" "Cr")) + (call (mem:QI (match_operand:P 1 "call_insn_operand" "Cr")) (match_operand:P 2 "general_operand" "g")))] "!SIBLING_CALL_P (insn)" { @@ -358,7 +358,7 @@ }) (define_insn "sibcall_insn_<mode>" - [(call (mem:QI (match_operand 0 "call_insn_operand" "Cr")) + [(call (mem:QI (match_operand:P 0 "call_insn_operand" "Cr")) (match_operand:P 1 "general_operand" "g"))] "SIBLING_CALL_P (insn)" { @@ -396,7 +396,7 @@ (define_insn "sibcall_value_insn_<mode>" [(set (match_operand 0 "" "=r") - (call (mem:QI (match_operand 1 "call_insn_operand" "Cr")) + (call (mem:QI (match_operand:P 1 "call_insn_operand" "Cr")) (match_operand:P 2 "general_operand" "g")))] "SIBLING_CALL_P 
(insn)" { diff --git a/gcc/config/h8300/testcompare.md b/gcc/config/h8300/testcompare.md index 694c9e6..3b43381 100644 --- a/gcc/config/h8300/testcompare.md +++ b/gcc/config/h8300/testcompare.md @@ -28,7 +28,7 @@ ;; (define_insn "" [(set (reg:CCZ CC_REG) - (eq (zero_extract:HSI (match_operand:HSI 0 "register_operand" "r") + (eq:CCZ (zero_extract:HSI (match_operand:HSI 0 "register_operand" "r") (const_int 1) (match_operand 1 "const_int_operand" "n")) (const_int 0)))] @@ -54,7 +54,7 @@ (define_insn "*tsthi_upper" [(set (reg:CCZN CC_REG) - (compare (and:HI (match_operand:HI 0 "register_operand" "r") + (compare:CCZN (and:HI (match_operand:HI 0 "register_operand" "r") (const_int -256)) (const_int 0)))] "reload_completed" @@ -63,7 +63,7 @@ (define_insn "*tsthi_upper_z" [(set (reg:CCZ CC_REG) - (compare (and:HI (match_operand:HI 0 "register_operand" "r") + (compare:CCZ (and:HI (match_operand:HI 0 "register_operand" "r") (const_int -256)) (const_int 0)))] "reload_completed" @@ -72,7 +72,7 @@ (define_insn "*tstsi_upper" [(set (reg:CCZN CC_REG) - (compare (and:SI (match_operand:SI 0 "register_operand" "r") + (compare:CCZN (and:SI (match_operand:SI 0 "register_operand" "r") (const_int -65536)) (const_int 0)))] "reload_completed" @@ -81,7 +81,7 @@ (define_insn "*cmp<mode>_c" [(set (reg:CCC CC_REG) - (ltu (match_operand:QHSI 0 "h8300_dst_operand" "rQ") + (ltu:CCC (match_operand:QHSI 0 "h8300_dst_operand" "rQ") (match_operand:QHSI 1 "h8300_src_operand" "rQi")))] "reload_completed" { @@ -97,7 +97,7 @@ (define_insn "*cmpqi_z" [(set (reg:CCZ CC_REG) - (eq (match_operand:QI 0 "h8300_dst_operand" "rQ") + (eq:CCZ (match_operand:QI 0 "h8300_dst_operand" "rQ") (match_operand:QI 1 "h8300_src_operand" "rQi")))] "reload_completed" { return "cmp.b %X1,%X0"; } @@ -105,7 +105,7 @@ (define_insn "*cmphi_z" [(set (reg:CCZ CC_REG) - (eq (match_operand:HI 0 "h8300_dst_operand" "rQ") + (eq:CCZ (match_operand:HI 0 "h8300_dst_operand" "rQ") (match_operand:HI 1 "h8300_src_operand" "rQi")))] "reload_completed" { return "cmp.w %T1,%T0"; } @@ -113,7 +113,7 @@ (define_insn "*cmpsi_z" [(set (reg:CCZ CC_REG) - (eq (match_operand:SI 0 "h8300_dst_operand" "rQ") + (eq:CCZ (match_operand:SI 0 "h8300_dst_operand" "rQ") (match_operand:SI 1 "h8300_src_operand" "rQi")))] "reload_completed" { return "cmp.l %S1,%S0"; } @@ -121,7 +121,7 @@ (define_insn "*cmpqi" [(set (reg:CC CC_REG) - (compare (match_operand:QI 0 "h8300_dst_operand" "rQ") + (compare:CC (match_operand:QI 0 "h8300_dst_operand" "rQ") (match_operand:QI 1 "h8300_src_operand" "rQi")))] "reload_completed" "cmp.b %X1,%X0" @@ -129,7 +129,7 @@ (define_insn "*cmphi" [(set (reg:CC CC_REG) - (compare (match_operand:HI 0 "h8300_dst_operand" "rU,rQ") + (compare:CC (match_operand:HI 0 "h8300_dst_operand" "rU,rQ") (match_operand:HI 1 "h8300_src_operand" "P3>X,rQi")))] "reload_completed" { @@ -150,7 +150,7 @@ (define_insn "cmpsi" [(set (reg:CC CC_REG) - (compare (match_operand:SI 0 "h8300_dst_operand" "r,rQ") + (compare:CC (match_operand:SI 0 "h8300_dst_operand" "r,rQ") (match_operand:SI 1 "h8300_src_operand" "P3>X,rQi")))] "reload_completed" { @@ -176,7 +176,7 @@ (define_peephole2 [(match_scratch:QHSI 1 "r") (set (reg:CC CC_REG) - (compare (match_operand:QHSI 0 "memory_operand" "") + (compare:CC (match_operand:QHSI 0 "memory_operand" "") (const_int 0)))] "!mode_dependent_address_p (XEXP (operands[0], 0), MEM_ADDR_SPACE (operands[0]))" [(parallel [(set (reg:CCZN CC_REG) (compare:CCZN (match_dup 0) (const_int 0))) @@ -187,7 +187,7 @@ (define_peephole2 [(match_scratch:QHSI 1 "r") (set (reg:CC 
CC_REG) - (compare (match_operand:QHSI 0 "memory_operand" "") + (compare:CC (match_operand:QHSI 0 "memory_operand" "") (const_int 0)))] "mode_dependent_address_p (XEXP (operands[0], 0), MEM_ADDR_SPACE (operands[0]))" [(parallel [(set (match_dup 1) (match_dup 0)) (clobber (reg:CC CC_REG))]) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 12cec61..3278f1f 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -3151,7 +3151,7 @@ ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) } /* Expand floating point op0 <=> op1, i.e. - dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */ + dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128. */ void ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2) @@ -3264,7 +3264,7 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2) if (l2) { emit_label (l2); - emit_move_insn (dest, op2 == const0_rtx ? const2_rtx : op2); + emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2); } emit_label (lend); } @@ -8241,8 +8241,10 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem, unsigned HOST_WIDE_INT countval = UINTVAL (count); unsigned HOST_WIDE_INT epilogue_size = countval % max_size; unsigned int destalign = MEM_ALIGN (destmem); + cfun->machine->by_pieces_in_use = true; move_by_pieces (destmem, srcmem, epilogue_size, destalign, RETURN_BEGIN); + cfun->machine->by_pieces_in_use = false; return; } if (max_size > 8) @@ -8405,8 +8407,8 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, /* Callback routine for store_by_pieces. Return the RTL of a register containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which - is a word or a word vector register. If PREV_P isn't nullptr, it - has the RTL info from the previous iteration. */ + is an integer or a word vector register. If PREV_P isn't nullptr, + it has the RTL info from the previous iteration. */ static rtx setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, @@ -8435,10 +8437,6 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, rtx op = (rtx) op_p; machine_mode op_mode = GET_MODE (op); - gcc_assert (op_mode == word_mode - || (VECTOR_MODE_P (op_mode) - && GET_MODE_INNER (op_mode) == word_mode)); - if (VECTOR_MODE_P (mode)) { gcc_assert (GET_MODE_INNER (mode) == QImode); @@ -8460,16 +8458,17 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, return tmp; } - target = gen_reg_rtx (word_mode); if (VECTOR_MODE_P (op_mode)) { + gcc_assert (GET_MODE_INNER (op_mode) == word_mode); + target = gen_reg_rtx (word_mode); op = gen_rtx_SUBREG (word_mode, op, 0); emit_move_insn (target, op); } else target = op; - if (mode == word_mode) + if (mode == GET_MODE (target)) return target; rtx tmp = gen_reg_rtx (mode); @@ -8490,9 +8489,11 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, unsigned HOST_WIDE_INT countval = UINTVAL (count); unsigned HOST_WIDE_INT epilogue_size = countval % max_size; unsigned int destalign = MEM_ALIGN (destmem); + cfun->machine->by_pieces_in_use = true; store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val, vec_value ? vec_value : value, destalign, true, RETURN_BEGIN); + cfun->machine->by_pieces_in_use = false; return; } if (max_size > 32) @@ -27034,6 +27035,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, return target; } +/* GF2P8AFFINEQB matrixes to implement shift and rotate. 
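For illustration only (this helper and its test values are mine, not part of the patch): the shift and rotate matrices defined just below can be sanity-checked with a scalar model of the per-byte GF(2) affine transform, using the byte layout that ix86_vgf2p8affine_shift_matrix (further down) expands into QImode vector elements, with imm8 = 0 as passed via const0_rtx. A minimal sketch, assuming the instruction's usual bit ordering:

#include <stdint.h>
#include <assert.h>

/* Scalar model of one byte lane of GF2P8AFFINEQB with imm8 == 0:
   result bit I is the parity of (matrix byte [7 - I] AND the source byte).  */
static uint8_t
gf2p8affine_byte (uint64_t matrix, uint8_t src)
{
  uint8_t out = 0;
  for (int i = 0; i < 8; i++)
    {
      uint8_t row = (matrix >> ((7 - i) * 8)) & 0xff;
      out |= (uint8_t) (__builtin_parity (row & src) << i);
    }
  return out;
}

int
main (void)
{
  for (int b = 0; b < 256; b++)
    {
      /* matrix_ashift[3] should behave as "<< 3" on every byte value.  */
      assert (gf2p8affine_byte (0x0000000102040810ULL, (uint8_t) b)
              == (uint8_t) (b << 3));
      /* matrix_rotate[1] should behave as a rotate left by 1.  */
      assert (gf2p8affine_byte (0x8001020408102040ULL, (uint8_t) b)
              == (uint8_t) ((b << 1) | (b >> 7)));
    }
  return 0;
}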
*/ + +static const uint64_t matrix_ashift[8] = +{ + 0, + 0x0001020408102040, /* 1 l */ + 0x0000010204081020, /* 2 l */ + 0x0000000102040810, /* 3 l */ + 0x0000000001020408, /* 4 l */ + 0x0000000000010204, /* 5 l */ + 0x0000000000000102, /* 6 l */ + 0x0000000000000001 /* 7 l */ +}; + +static const uint64_t matrix_lshiftrt[8] = +{ + 0, + 0x0204081020408000, /* 1 r */ + 0x0408102040800000, /* 2 r */ + 0x0810204080000000, /* 3 r */ + 0x1020408000000000, /* 4 r */ + 0x2040800000000000, /* 5 r */ + 0x4080000000000000, /* 6 r */ + 0x8000000000000000 /* 7 r */ +}; + +static const uint64_t matrix_ashiftrt[8] = +{ + 0, + 0x0204081020408080, /* 1 r */ + 0x0408102040808080, /* 2 r */ + 0x0810204080808080, /* 3 r */ + 0x1020408080808080, /* 4 r */ + 0x2040808080808080, /* 5 r */ + 0x4080808080808080, /* 6 r */ + 0x8080808080808080 /* 7 r */ +}; + +static const uint64_t matrix_rotate[8] = +{ + 0, + 0x8001020408102040, /* 1 rol8 */ + 0x4080010204081020, /* 2 rol8 */ + 0x2040800102040810, /* 3 rol8 */ + 0x1020408001020408, /* 4 rol8 */ + 0x0810204080010204, /* 5 rol8 */ + 0x0408102040800102, /* 6 rol8 */ + 0x0204081020408001 /* 7 rol8 */ +}; + +static const uint64_t matrix_rotatert[8] = +{ + 0, + 0x0204081020408001, /* 1 ror8 */ + 0x0408102040800102, /* 2 ror8 */ + 0x0810204080010204, /* 3 ror8 */ + 0x1020408001020408, /* 4 ror8 */ + 0x2040800102040810, /* 5 ror8 */ + 0x4080010204081020, /* 6 ror8 */ + 0x8001020408102040 /* 7 ror8 */ +}; + +/* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift + for CODE and shift count COUNT into register with vector of size of SRC. */ + +rtx +ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code) +{ + machine_mode mode = GET_MODE (src); + const uint64_t *matrix; + unsigned shift = INTVAL (count) & 7; + gcc_assert (shift > 0 && shift < 8); + + switch (code) + { + case ASHIFT: + matrix = matrix_ashift; + break; + case ASHIFTRT: + matrix = matrix_ashiftrt; + break; + case LSHIFTRT: + matrix = matrix_lshiftrt; + break; + case ROTATE: + matrix = matrix_rotate; + break; + case ROTATERT: + matrix = matrix_rotatert; + break; + default: + gcc_unreachable (); + } + + int nelts = GET_MODE_NUNITS (mode); + rtvec vec = rtvec_alloc (nelts); + uint64_t ma = matrix[shift]; + for (int i = 0; i < nelts; i++) + RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode); + + return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec)); +} + /* Trunc a vector to a narrow vector, like v4di -> v4si. */ void diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 9941e61..0608dd2 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3089,10 +3089,13 @@ enum x86_cse_kind { X86_CSE_CONST0_VECTOR, X86_CSE_CONSTM1_VECTOR, - X86_CSE_VEC_DUP + X86_CSE_VEC_DUP, + X86_CSE_TLS_GD, + X86_CSE_TLS_LD_BASE, + X86_CSE_TLSDESC }; -struct redundant_load +struct redundant_pattern { /* Bitmap of basic blocks with broadcast instructions. */ auto_bitmap bbs; @@ -3100,6 +3103,8 @@ struct redundant_load auto_bitmap insns; /* The broadcast inner scalar. */ rtx val; + /* The actual redundant source value for UNSPEC_TLSDESC. */ + rtx tlsdesc_val; /* The inner scalar mode. */ machine_mode mode; /* The instruction which sets the inner scalar. 
Nullptr if the inner @@ -3130,7 +3135,7 @@ struct redundant_load static void ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, - redundant_load *load = nullptr) + redundant_pattern *load = nullptr) { basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop @@ -3639,6 +3644,8 @@ ix86_broadcast_inner (rtx op, machine_mode mode, Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an integer constant. */ op = src; + if (mode != GET_MODE (reg)) + op = gen_int_mode (INTVAL (src), mode); *insn_p = nullptr; } else @@ -3679,25 +3686,719 @@ ix86_broadcast_inner (rtx op, machine_mode mode, return op; } -/* At entry of the nearest common dominator for basic blocks with vector - CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest - vector set instruction for all CONST0_RTX and integer CONSTM1_RTX - uses. +/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and + put the updated instruction in UPDATED_TLS_INSNS. */ - NB: We want to generate only a single widest vector set to cover the - whole function. The LCM algorithm isn't appropriate here since it - may place a vector set inside the loop. */ +static void +replace_tls_call (rtx src, auto_bitmap &tls_call_insns, + auto_bitmap &updated_tls_insns) +{ + bitmap_iterator bi; + unsigned int id; -static unsigned int -remove_redundant_vector_load (void) + EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi) + { + rtx_insn *insn = DF_INSN_UID_GET (id)->insn; + + /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are + allowed. */ + if (!CALL_P (insn)) + { + attr_tls64 tls64 = get_attr_tls64 (insn); + if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE) + gcc_unreachable (); + } + + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); + rtx set = XVECEXP (pat, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx dest = SET_DEST (set); + + set = gen_rtx_SET (dest, src); + rtx_insn *set_insn = emit_insn_after (set, insn); + if (recog_memoized (set_insn) < 0) + gcc_unreachable (); + + /* Put SET_INSN in UPDATED_TLS_INSNS. */ + bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn)); + + if (dump_file) + { + fprintf (dump_file, "\nReplace:\n\n"); + print_rtl_single (dump_file, insn); + fprintf (dump_file, "\nwith:\n\n"); + print_rtl_single (dump_file, set_insn); + fprintf (dump_file, "\n"); + } + + /* Delete the CALL insn. */ + delete_insn (insn); + + df_insn_rescan (set_insn); + } +} + +/* Return the basic block which dominates all basic blocks which set + hard register REGNO used in basic block BB. */ + +static basic_block +ix86_get_dominator_for_reg (unsigned int regno, basic_block bb) +{ + basic_block set_bb; + auto_bitmap set_bbs; + + /* Get all BBs which set REGNO and dominate the current BB from all + DEFs of REGNO. */ + for (df_ref def = DF_REG_DEF_CHAIN (regno); + def; + def = DF_REF_NEXT_REG (def)) + if (!DF_REF_IS_ARTIFICIAL (def) + && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER) + && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER)) + { + set_bb = DF_REF_BB (def); + if (dominated_by_p (CDI_DOMINATORS, bb, set_bb)) + bitmap_set_bit (set_bbs, set_bb->index); + } + + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs); + return bb; +} + +/* Mark FLAGS register as live in DATA, a bitmap of live caller-saved + registers, if DEST is FLAGS register. 
*/ + +static void +ix86_check_flags_reg (rtx dest, const_rtx, void *data) +{ + auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data; + if (REG_P (dest) && REGNO (dest) == FLAGS_REG) + bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG); +} + +/* Emit a TLS_SET instruction of KIND in basic block BB. Store the + insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P + for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions + which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS + contains instructions which replace the GNU2 TLS instructions. */ + +static rtx_insn * +ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb, + rtx_insn **before_p, rtx_insn **after_p, + auto_bitmap &updated_gnu_tls_insns, + auto_bitmap &updated_gnu2_tls_insns) +{ + rtx_insn *tls_insn; + + do + { + rtx_insn *insn = BB_HEAD (bb); + while (insn && !NONDEBUG_INSN_P (insn)) + { + if (insn == BB_END (bb)) + { + /* This must be the beginning basic block: + + (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG) + + or a basic block with only a label: + + (code_label 78 11 77 3 14 (nil) [1 uses]) + (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK) + + or a basic block with only a debug marker: + + (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) + (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + + */ + gcc_assert (DEBUG_INSN_P (insn) + || (NOTE_P (insn) + && ((NOTE_KIND (insn) + == NOTE_INSN_FUNCTION_BEG) + || (NOTE_KIND (insn) + == NOTE_INSN_BASIC_BLOCK)))); + insn = NULL; + break; + } + insn = NEXT_INSN (insn); + } + + /* TLS_GD and TLS_LD_BASE instructions are normal functions which + clobber caller-saved registers. TLSDESC instructions only + clobber FLAGS. If any registers clobbered by TLS instructions + are live in this basic block, we must insert TLS instructions + after all live registers clobbered are dead. */ + + auto_bitmap live_caller_saved_regs; + bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb); + + if (bitmap_bit_p (in, FLAGS_REG)) + bitmap_set_bit (live_caller_saved_regs, FLAGS_REG); + + unsigned int i; + + /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE + instructions. */ + if (kind != X86_CSE_TLSDESC) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (call_used_regs[i] + && !fixed_regs[i] + && bitmap_bit_p (in, i)) + bitmap_set_bit (live_caller_saved_regs, i); + + if (bitmap_empty_p (live_caller_saved_regs)) + { + if (insn == BB_HEAD (bb)) + { + *before_p = insn; + tls_insn = emit_insn_before (tls_set, insn); + } + else + { + /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the + beginning basic block: + + (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG) + + or after NOTE_INSN_BASIC_BLOCK in a basic block with + only a label: + + (code_label 78 11 77 3 14 (nil) [1 uses]) + (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK) + + or after debug marker in a basic block with only a + debug marker: + + (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) + (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + + */ + insn = insn ? PREV_INSN (insn) : BB_END (bb); + *after_p = insn; + tls_insn = emit_insn_after (tls_set, insn); + } + return tls_insn; + } + + bool repeat = false; + + /* Search for REG_DEAD notes in this basic block. 
*/ + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + /* NB: Conditional jump is the only instruction which reads + flags register and changes control flow. We can never + place the TLS call after unconditional jump. */ + if (JUMP_P (insn)) + { + /* This must be a conditional jump. */ + rtx label = JUMP_LABEL (insn); + if (label == nullptr + || ANY_RETURN_P (label) + || !(LABEL_P (label) || SYMBOL_REF_P (label))) + gcc_unreachable (); + + /* Place the call before all FLAGS_REG setting BBs since + we can't place a call before nor after a conditional + jump. */ + bb = ix86_get_dominator_for_reg (FLAGS_REG, bb); + + /* Start over again. */ + repeat = true; + break; + } + + if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn))) + { + /* Insert the __tls_get_addr call before INSN which + replaces a __tls_get_addr call. */ + *before_p = insn; + tls_insn = emit_insn_before (tls_set, insn); + return tls_insn; + } + + if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn))) + { + /* Mark FLAGS register as dead since FLAGS register + would be clobbered by the GNU2 TLS instruction. */ + bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG); + continue; + } + + /* Check if FLAGS register is live. */ + note_stores (insn, ix86_check_flags_reg, + &live_caller_saved_regs); + + rtx link; + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == REG_DEAD + && REG_P (XEXP (link, 0))) + { + /* Mark the live caller-saved register as dead. */ + for (i = REGNO (XEXP (link, 0)); + i < END_REGNO (XEXP (link, 0)); + i++) + if (i < FIRST_PSEUDO_REGISTER) + bitmap_clear_bit (live_caller_saved_regs, i); + + if (bitmap_empty_p (live_caller_saved_regs)) + { + *after_p = insn; + tls_insn = emit_insn_after (tls_set, insn); + return tls_insn; + } + } + } + + /* NB: Start over again for conditional jump. */ + if (repeat) + continue; + + gcc_assert (!bitmap_empty_p (live_caller_saved_regs)); + + /* If any live caller-saved registers aren't dead at the end of + this basic block, get the basic block which dominates all + basic blocks which set the remaining live registers. */ + auto_bitmap set_bbs; + bitmap_iterator bi; + unsigned int id; + EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi) + { + basic_block set_bb = ix86_get_dominator_for_reg (id, bb); + bitmap_set_bit (set_bbs, set_bb->index); + } + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs); + } + while (true); +} + +/* Generate a TLS call of KIND with VAL and copy the call result to DEST, + at entry of the nearest dominator for basic block map BBS, which is in + the fake loop that contains the whole function, so that there is only + a single TLS CALL of KIND with VAL in the whole function. + UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS + instructions. UPDATED_GNU2_TLS_INSNS contains instructions which + replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr, + insert it before the TLS call. 
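A hedged, source-level illustration of what the TLS side of the pass is aiming at (the example and options are mine, not from the patch): with -fPIC and the global-dynamic model, each access below would otherwise materialize its own __tls_get_addr or TLS-descriptor call; the placement logic described above is meant to leave a single call at a dominating point and let every access reuse its result.

/* Assumed compilation: -O2 -fPIC -mtls-dialect=gnu (or gnu2).  */
extern __thread int counter;

int
tally (const int *p, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += p[i] > 0 ? counter : -counter;   /* a TLS access on either branch  */
  return s + counter;                     /* and another one after the loop */
}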
*/ + +static void +ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind, + auto_bitmap &bbs, + auto_bitmap &updated_gnu_tls_insns, + auto_bitmap &updated_gnu2_tls_insns, + rtx tlsdesc_set = nullptr) +{ + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + + rtx rax = nullptr, rdi; + rtx eqv = nullptr; + rtx caddr; + rtx set; + rtx clob; + rtx symbol; + rtx tls; + + switch (kind) + { + case X86_CSE_TLS_GD: + rax = gen_rtx_REG (Pmode, AX_REG); + rdi = gen_rtx_REG (Pmode, DI_REG); + caddr = ix86_tls_get_addr (); + + symbol = XVECEXP (val, 0, 0); + tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi); + + if (GET_MODE (symbol) != Pmode) + symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol); + eqv = symbol; + break; + + case X86_CSE_TLS_LD_BASE: + rax = gen_rtx_REG (Pmode, AX_REG); + rdi = gen_rtx_REG (Pmode, DI_REG); + caddr = ix86_tls_get_addr (); + + tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi); + + /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers + to share the LD_BASE result with other LD model accesses. */ + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_TLS_LD_BASE); + + break; + + case X86_CSE_TLSDESC: + set = gen_rtx_SET (dest, val); + clob = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (CCmode, FLAGS_REG)); + tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob)); + break; + + default: + gcc_unreachable (); + } + + /* Emit the TLS CALL insn. */ + rtx_insn *before = nullptr; + rtx_insn *after = nullptr; + rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before, + &after, + updated_gnu_tls_insns, + updated_gnu2_tls_insns); + + rtx_insn *tlsdesc_insn = nullptr; + if (tlsdesc_set) + { + rtx dest = copy_rtx (SET_DEST (tlsdesc_set)); + rtx src = copy_rtx (SET_SRC (tlsdesc_set)); + tlsdesc_set = gen_rtx_SET (dest, src); + tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn); + } + + if (kind != X86_CSE_TLSDESC) + { + RTL_CONST_CALL_P (tls_insn) = 1; + + /* Indicate that this function can't jump to non-local gotos. */ + make_reg_eh_region_note_nothrow_nononlocal (tls_insn); + } + + if (recog_memoized (tls_insn) < 0) + gcc_unreachable (); + + if (dump_file) + { + if (after) + { + fprintf (dump_file, "\nPlace:\n\n"); + if (tlsdesc_insn) + print_rtl_single (dump_file, tlsdesc_insn); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, after); + fprintf (dump_file, "\n"); + } + else + { + fprintf (dump_file, "\nPlace:\n\n"); + if (tlsdesc_insn) + print_rtl_single (dump_file, tlsdesc_insn); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\nbefore:\n\n"); + print_rtl_single (dump_file, before); + fprintf (dump_file, "\n"); + } + } + + if (kind != X86_CSE_TLSDESC) + { + /* Copy RAX to DEST. 
*/ + set = gen_rtx_SET (dest, rax); + rtx_insn *set_insn = emit_insn_after (set, tls_insn); + set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest); + if (dump_file) + { + fprintf (dump_file, "\nPlace:\n\n"); + print_rtl_single (dump_file, set_insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\n"); + } + } +} + +namespace { + +const pass_data pass_data_x86_cse = +{ + RTL_PASS, /* type */ + "x86_cse", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_x86_cse : public rtl_opt_pass +{ +public: + pass_x86_cse (gcc::context *ctxt) + : rtl_opt_pass (pass_data_x86_cse, ctxt) + {} + + /* opt_pass methods: */ + bool gate (function *fun) final override + { + return (TARGET_SSE2 + && optimize + && optimize_function_for_speed_p (fun)); + } + + unsigned int execute (function *) final override + { + return x86_cse (); + } + +private: + /* The redundant source value. */ + rtx val; + /* The actual redundant source value for UNSPEC_TLSDESC. */ + rtx tlsdesc_val; + /* The instruction which defines the redundant value. */ + rtx_insn *def_insn; + /* Mode of the destination of the candidate redundant instruction. */ + machine_mode mode; + /* Mode of the source of the candidate redundant instruction. */ + machine_mode scalar_mode; + /* The classification of the candidate redundant instruction. */ + x86_cse_kind kind; + + unsigned int x86_cse (void); + bool candidate_gnu_tls_p (rtx_insn *, attr_tls64); + bool candidate_gnu2_tls_p (rtx, attr_tls64); + bool candidate_vector_p (rtx); + rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx); +}; // class pass_x86_cse + +/* Return the instruction which sets REG from TLS_SYMBOL. */ + +rtx_insn * +pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg, + const_rtx tls_symbol) +{ + rtx_insn *set_insn = nullptr; + for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg)); + ref; + ref = DF_REF_NEXT_REG (ref)) + { + if (DF_REF_IS_ARTIFICIAL (ref)) + return nullptr; + + set_insn = DF_REF_INSN (ref); + if (get_attr_tls64 (set_insn) != TLS64_LEA) + return nullptr; + + rtx tls_set = PATTERN (set_insn); + rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0); + if (!rtx_equal_p (tls_symbol, tls_src)) + return nullptr; + } + + return set_insn; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. 
*/ + +bool +pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64) +{ + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) + return false; + + /* Record the redundant TLS CALLs for 64-bit: + + (parallel [ + (set (reg:DI 0 ax) + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) + (const_int 0 [0]))) + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) + (reg/f:DI 7 sp)] UNSPEC_TLS_GD) + (clobber (reg:DI 5 di))]) + + + and + + (parallel [ + (set (reg:DI 0 ax) + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) + (const_int 0 [0]))) + (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)]) + + */ + + rtx pat = PATTERN (insn); + rtx set = XVECEXP (pat, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx dest = SET_DEST (set); + scalar_mode = mode = GET_MODE (dest); + val = XVECEXP (pat, 0, 1); + gcc_assert (GET_CODE (val) == UNSPEC); + + if (tls64 == TLS64_GD) + kind = X86_CSE_TLS_GD; + else + kind = X86_CSE_TLS_LD_BASE; + + def_insn = nullptr; + return true; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + SET is UNSPEC_TLSDESC. */ + +bool +pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64) +{ + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) + return false; + + rtx tls_symbol; + rtx_insn *set_insn; + rtx src = SET_SRC (set); + val = src; + tlsdesc_val = src; + kind = X86_CSE_TLSDESC; + + if (tls64 == TLS64_COMBINE) + { + /* Record 64-bit TLS64_COMBINE: + + (set (reg/f:DI 104) + (plus:DI (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + (reg:DI 114) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)))) + + (set (reg/f:DI 104) + (plus:DI (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + ] UNSPEC_TLSDESC) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)))) + */ + + scalar_mode = mode = GET_MODE (src); + + /* Since the first operand of PLUS in the source TLS_COMBINE + pattern is unused, use the second operand of PLUS: + + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)) + + as VAL to check if 2 TLS_COMBINE patterns have the same + source. */ + val = XEXP (src, 1); + gcc_assert (GET_CODE (val) == CONST + && GET_CODE (XEXP (val, 0)) == UNSPEC + && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF + && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0))); + def_insn = nullptr; + return true; + } + + /* Record 64-bit TLS_CALL: + + (set (reg:DI 101) + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) + (reg:DI 112) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) + + */ + + gcc_assert (GET_CODE (src) == UNSPEC); + tls_symbol = XVECEXP (src, 0, 0); + src = XVECEXP (src, 0, 1); + scalar_mode = mode = GET_MODE (src); + gcc_assert (REG_P (src)); + + /* All definitions of reg:DI 129 in + + (set (reg:DI 110) + (unspec:DI [(symbol_ref:DI ("foo")) + (reg:DI 129) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) + + should have the same source as in + + (set (reg:DI 129) + (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC)) + + */ + + set_insn = tls_set_insn_from_symbol (src, tls_symbol); + if (!set_insn) + return false; + + /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */ + val = tls_symbol; + def_insn = set_insn; + return true; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + INSN is a vector broadcast instruction. 
*/ + +bool +pass_x86_cse::candidate_vector_p (rtx set) +{ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + mode = GET_MODE (dest); + /* Skip non-vector instruction. */ + if (!VECTOR_MODE_P (mode)) + return false; + + /* Skip non-vector load instruction. */ + if (!REG_P (dest) && !SUBREG_P (dest)) + return false; + + val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind, + &def_insn); + return val ? true : false; +} + +/* At entry of the nearest common dominator for basic blocks with + + 1. Vector CONST0_RTX patterns. + 2. Vector CONSTM1_RTX patterns. + 3. Vector broadcast patterns. + 4. UNSPEC_TLS_GD patterns. + 5. UNSPEC_TLS_LD_BASE patterns. + 6. UNSPEC_TLSDESC patterns. + + generate a single pattern whose destination is used to replace the + source in all identical patterns. + + NB: We want to generate a pattern, which is executed only once, to + cover the whole function. The LCM algorithm isn't appropriate here + since it may place a pattern inside the loop. */ + +unsigned int +pass_x86_cse::x86_cse (void) { timevar_push (TV_MACH_DEP); - auto_vec<redundant_load *> loads; - redundant_load *load; + auto_vec<redundant_pattern *> loads; + redundant_pattern *load; basic_block bb; rtx_insn *insn; unsigned int i; + auto_bitmap updated_gnu_tls_insns; + auto_bitmap updated_gnu2_tls_insns; df_set_flags (DF_DEFER_INSN_RESCAN); @@ -3710,61 +4411,74 @@ remove_redundant_vector_load (void) if (!NONDEBUG_INSN_P (insn)) continue; + bool matched = false; + /* Remove redundant pattens if there are more than 2 of + them. */ + unsigned int threshold = 2; + rtx set = single_set (insn); - if (!set) + if (!set && !CALL_P (insn)) continue; - /* Record single set vector instruction with CONST0_RTX and - CONSTM1_RTX source. Record basic blocks with CONST0_RTX and - CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the - maximum size of CONST0_RTX and CONSTM1_RTX. */ + tlsdesc_val = nullptr; - rtx dest = SET_DEST (set); - machine_mode mode = GET_MODE (dest); - /* Skip non-vector instruction. */ - if (!VECTOR_MODE_P (mode)) - continue; + attr_tls64 tls64 = get_attr_tls64 (insn); + switch (tls64) + { + case TLS64_GD: + case TLS64_LD_BASE: + /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */ + if (candidate_gnu_tls_p (insn, tls64)) + break; + continue; - rtx src = SET_SRC (set); - /* Skip non-vector load instruction. */ - if (!REG_P (dest) && !SUBREG_P (dest)) - continue; + case TLS64_CALL: + case TLS64_COMBINE: + /* Verify UNSPEC_TLSDESC. */ + if (candidate_gnu2_tls_p (set, tls64)) + break; + continue; - rtx_insn *def_insn; - machine_mode scalar_mode; - x86_cse_kind kind; - rtx val = ix86_broadcast_inner (src, mode, &scalar_mode, - &kind, &def_insn); - if (!val) - continue; + case TLS64_LEA: + /* Skip TLS64_LEA. */ + continue; - /* Remove redundant register loads if there are more than 2 - loads will be used. */ - unsigned int threshold = 2; + case TLS64_NONE: + if (!set) + continue; - /* Check if there is a matching redundant vector load. */ - bool matched = false; + /* Check for vector broadcast. */ + if (candidate_vector_p (set)) + break; + continue; + } + + /* Check if there is a matching redundant load. */ FOR_EACH_VEC_ELT (loads, i, load) if (load->val && load->kind == kind && load->mode == scalar_mode && (load->bb == bb - || kind < X86_CSE_VEC_DUP + || kind != X86_CSE_VEC_DUP /* Non all 0s/1s vector load must be in the same basic block if it is in a recursive call. */ || !recursive_call_p) && rtx_equal_p (load->val, val)) { - /* Record vector instruction. */ + /* Record instruction. 
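An assumed example of the pre-existing vector-broadcast case that now shares this machinery with the TLS kinds (the function and options are mine): both loops want a broadcast of the same scalar, and per the comment above the pass is meant to keep one broadcast at a common dominating block instead of one per loop.

/* Assumed compilation: -O2 -mavx2, with both loops auto-vectorized.  */
void
scale_both (int *a, int *b, int n, int x)
{
  for (int i = 0; i < n; i++)
    a[i] += x;          /* needs a broadcast of x */
  for (int i = 0; i < n; i++)
    b[i] -= x;          /* would otherwise broadcast x again */
}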
*/ bitmap_set_bit (load->insns, INSN_UID (insn)); /* Record the maximum vector size. */ - if (load->size < GET_MODE_SIZE (mode)) + if (kind <= X86_CSE_VEC_DUP + && load->size < GET_MODE_SIZE (mode)) load->size = GET_MODE_SIZE (mode); /* Record the basic block. */ bitmap_set_bit (load->bbs, bb->index); + + /* Increment the count. */ load->count++; + matched = true; break; } @@ -3772,10 +4486,17 @@ remove_redundant_vector_load (void) if (matched) continue; - /* We see this vector broadcast the first time. */ - load = new redundant_load; + /* We see this instruction the first time. Record the + redundant source value, its mode, the destination size, + instruction which defines the redundant source value, + instruction basic block and the instruction kind. */ + load = new redundant_pattern; load->val = copy_rtx (val); + if (tlsdesc_val) + load->tlsdesc_val = copy_rtx (tlsdesc_val); + else + load->tlsdesc_val = nullptr; load->mode = scalar_mode; load->size = GET_MODE_SIZE (mode); load->def_insn = def_insn; @@ -3792,49 +4513,64 @@ remove_redundant_vector_load (void) } bool replaced = false; - rtx reg, broadcast_source, broadcast_reg; FOR_EACH_VEC_ELT (loads, i, load) if (load->count >= load->threshold) { - machine_mode mode = ix86_get_vector_cse_mode (load->size, - load->mode); - broadcast_reg = gen_reg_rtx (mode); - if (load->def_insn) - { - /* Replace redundant vector loads with a single vector load - in the same basic block. */ - reg = load->val; - if (load->mode != GET_MODE (reg)) - reg = gen_rtx_SUBREG (load->mode, reg, 0); - broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); - replace_vector_const (mode, broadcast_reg, load->insns, - load->mode); - } - else + machine_mode mode; + rtx reg, broadcast_source, broadcast_reg; + replaced = true; + switch (load->kind) { - /* This is a constant integer/double vector. If the - inner scalar is 0 or -1, set vector to CONST0_RTX - or CONSTM1_RTX directly. */ - rtx reg; - switch (load->kind) + case X86_CSE_TLS_GD: + case X86_CSE_TLS_LD_BASE: + case X86_CSE_TLSDESC: + broadcast_reg = gen_reg_rtx (load->mode); + replace_tls_call (broadcast_reg, load->insns, + (load->kind == X86_CSE_TLSDESC + ? updated_gnu2_tls_insns + : updated_gnu_tls_insns)); + load->broadcast_reg = broadcast_reg; + break; + + case X86_CSE_CONST0_VECTOR: + case X86_CSE_CONSTM1_VECTOR: + case X86_CSE_VEC_DUP: + mode = ix86_get_vector_cse_mode (load->size, load->mode); + broadcast_reg = gen_reg_rtx (mode); + if (load->def_insn) { - case X86_CSE_CONST0_VECTOR: - broadcast_source = CONST0_RTX (mode); - break; - case X86_CSE_CONSTM1_VECTOR: - broadcast_source = CONSTM1_RTX (mode); - break; - default: - reg = gen_reg_rtx (load->mode); + /* Replace redundant vector loads with a single vector + load in the same basic block. */ + reg = load->val; + if (load->mode != GET_MODE (reg)) + reg = gen_rtx_SUBREG (load->mode, reg, 0); broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); - break; } + else + /* This is a constant integer/double vector. If the + inner scalar is 0 or -1, set vector to CONST0_RTX + or CONSTM1_RTX directly. 
*/ + switch (load->kind) + { + case X86_CSE_CONST0_VECTOR: + broadcast_source = CONST0_RTX (mode); + break; + case X86_CSE_CONSTM1_VECTOR: + broadcast_source = CONSTM1_RTX (mode); + break; + case X86_CSE_VEC_DUP: + reg = gen_reg_rtx (load->mode); + broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); + break; + default: + gcc_unreachable (); + } replace_vector_const (mode, broadcast_reg, load->insns, load->mode); + load->broadcast_source = broadcast_source; + load->broadcast_reg = broadcast_reg; + break; } - load->broadcast_source = broadcast_source; - load->broadcast_reg = broadcast_reg; - replaced = true; } if (replaced) @@ -3849,40 +4585,75 @@ remove_redundant_vector_load (void) FOR_EACH_VEC_ELT (loads, i, load) if (load->count >= load->threshold) { + rtx set; if (load->def_insn) - { - /* Insert a broadcast after the original scalar - definition. */ - rtx set = gen_rtx_SET (load->broadcast_reg, - load->broadcast_source); - insn = emit_insn_after (set, load->def_insn); - - if (cfun->can_throw_non_call_exceptions) - { - /* Handle REG_EH_REGION note in DEF_INSN. */ - rtx note = find_reg_note (load->def_insn, - REG_EH_REGION, nullptr); - if (note) - { - control_flow_insns.safe_push (load->def_insn); - add_reg_note (insn, REG_EH_REGION, - XEXP (note, 0)); - } - } + switch (load->kind) + { + case X86_CSE_TLSDESC: + ix86_place_single_tls_call (load->broadcast_reg, + load->tlsdesc_val, + load->kind, + load->bbs, + updated_gnu_tls_insns, + updated_gnu2_tls_insns, + PATTERN (load->def_insn)); + break; + case X86_CSE_VEC_DUP: + /* Insert a broadcast after the original scalar + definition. */ + set = gen_rtx_SET (load->broadcast_reg, + load->broadcast_source); + insn = emit_insn_after (set, load->def_insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note in DEF_INSN. */ + rtx note = find_reg_note (load->def_insn, + REG_EH_REGION, nullptr); + if (note) + { + control_flow_insns.safe_push (load->def_insn); + add_reg_note (insn, REG_EH_REGION, + XEXP (note, 0)); + } + } - if (dump_file) - { - fprintf (dump_file, "\nAdd:\n\n"); - print_rtl_single (dump_file, insn); - fprintf (dump_file, "\nafter:\n\n"); - print_rtl_single (dump_file, load->def_insn); - fprintf (dump_file, "\n"); - } - } + if (dump_file) + { + fprintf (dump_file, "\nAdd:\n\n"); + print_rtl_single (dump_file, insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, load->def_insn); + fprintf (dump_file, "\n"); + } + break; + default: + gcc_unreachable (); + } else - ix86_place_single_vector_set (load->broadcast_reg, - load->broadcast_source, - load->bbs, load); + switch (load->kind) + { + case X86_CSE_TLS_GD: + case X86_CSE_TLS_LD_BASE: + case X86_CSE_TLSDESC: + ix86_place_single_tls_call (load->broadcast_reg, + (load->kind == X86_CSE_TLSDESC + ? 
load->tlsdesc_val + : load->val), + load->kind, + load->bbs, + updated_gnu_tls_insns, + updated_gnu2_tls_insns); + break; + case X86_CSE_CONST0_VECTOR: + case X86_CSE_CONSTM1_VECTOR: + case X86_CSE_VEC_DUP: + ix86_place_single_vector_set (load->broadcast_reg, + load->broadcast_source, + load->bbs, + load); + break; + } } loop_optimizer_finalize (); @@ -3912,48 +4683,12 @@ remove_redundant_vector_load (void) return 0; } -namespace { - -const pass_data pass_data_remove_redundant_vector_load = -{ - RTL_PASS, /* type */ - "rrvl", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - 0, /* todo_flags_finish */ -}; - -class pass_remove_redundant_vector_load : public rtl_opt_pass -{ -public: - pass_remove_redundant_vector_load (gcc::context *ctxt) - : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt) - {} - - /* opt_pass methods: */ - bool gate (function *fun) final override - { - return (TARGET_SSE2 - && optimize - && optimize_function_for_speed_p (fun)); - } - - unsigned int execute (function *) final override - { - return remove_redundant_vector_load (); - } -}; // class pass_remove_redundant_vector_load - } // anon namespace rtl_opt_pass * -make_pass_remove_redundant_vector_load (gcc::context *ctxt) +make_pass_x86_cse (gcc::context *ctxt) { - return new pass_remove_redundant_vector_load (ctxt); + return new pass_x86_cse (ctxt); } /* Convert legacy instructions that clobbers EFLAGS to APX_NF diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 09a35ef..abb5dd7 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1172,6 +1172,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], OPT_mrecip, MASK_RECIP), + IX86_ATTR_YES ("80387", + OPT_m80387, + MASK_80387), + IX86_ATTR_IX86_YES ("general-regs-only", OPT_mgeneral_regs_only, OPTION_MASK_GENERAL_REGS_ONLY), @@ -1281,6 +1285,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], else if (type == ix86_opt_yes || type == ix86_opt_no) { + opts_set->x_target_flags |= mask; + if (type == ix86_opt_no) opt_set_p = !opt_set_p; @@ -3556,6 +3562,10 @@ ix86_set_current_function (tree fndecl) isa = "AVX"; else if (cfun->machine->func_type != TYPE_NORMAL) isa = "SSE"; + else if (TARGET_MMX) + isa = "MMX/3Dnow"; + else if (TARGET_80387) + isa = "80387"; else isa = NULL; } diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 06f0288..553b46d 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -35,6 +35,6 @@ along with GCC; see the file COPYING3. If not see PR116174. 
*/ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); - INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load); + INSERT_PASS_AFTER (pass_late_combine, 1, pass_x86_cse); INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency); INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 69bc0ee..bdb8bb9 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -290,6 +290,7 @@ extern rtx ix86_tls_module_base (void); extern bool ix86_gpr_tls_address_pattern_p (rtx); extern bool ix86_tls_address_pattern_p (rtx); extern rtx ix86_rewrite_tls_address (rtx); +extern rtx ix86_tls_get_addr (void); extern void ix86_expand_vector_init (bool, rtx, rtx); extern void ix86_expand_vector_set (bool, rtx, rtx, int); @@ -430,8 +431,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); -extern rtl_opt_pass *make_pass_remove_redundant_vector_load - (gcc::context *); +extern rtl_opt_pass *make_pass_x86_cse (gcc::context *); extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); @@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap); /* In i386-expand.cc. */ bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*, HOST_WIDE_INT*); +rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 65e04d3..471be3e 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -11382,6 +11382,23 @@ ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) return cost; } + +/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */ + +bool +ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, + unsigned int align, + enum by_pieces_operation op, + bool speed_p) +{ + /* Return true when we are currently expanding memcpy/memset epilogue + with move_by_pieces or store_by_pieces. */ + if (cfun->machine->by_pieces_in_use) + return true; + + return default_use_by_pieces_infrastructure_p (size, align, op, + speed_p); +} /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as this is used for to form addresses to local data when -fPIC is in @@ -12439,7 +12456,7 @@ ix86_tls_index (void) static GTY(()) rtx ix86_tls_symbol; -static rtx +rtx ix86_tls_get_addr (void) { if (cfun->machine->call_saved_registers @@ -22102,6 +22119,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, } /* FALLTHRU */ case V32QImode: + if (TARGET_GFNI && constant_op1) + { + /* Use vgf2p8affine. One extra load for the mask, but in a loop + with enough registers it will be moved out. So for now don't + account the constant mask load. This is not quite right + for non loop vectorization. */ + extra = 0; + return ix86_vec_cost (mode, cost->sse_op) + extra; + } if (TARGET_AVX2) /* Use vpbroadcast. */ extra = cost->sse_op; @@ -22136,6 +22162,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, count = 9; return ix86_vec_cost (mode, cost->sse_op * count) + extra; + case V64QImode: + /* Ignore the mask load for GF2P8AFFINEQB. */ + extra = 0; + return ix86_vec_cost (mode, cost->sse_op) + extra; + case V2DImode: case V4DImode: /* V*DImode arithmetic right shift is emulated. 
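A small assumed example of the loop shape the new V32QI/V64QI shift costing has in mind (the options and the vectorization outcome are my assumption, not stated by the patch): the shift count is constant, so one GF2P8AFFINEQB matrix constant can serve every iteration and is expected to be hoisted out of the loop, which is why the mask load is not charged per statement above.

/* Assumed: -O2 -mgfni -mavx2 (or -mavx512bw for 64-byte vectors).  */
void
shr2_bytes (unsigned char *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] >>= 2;   /* per-byte constant shift, a candidate for vgf2p8affineqb */
}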
*/ @@ -25794,15 +25825,20 @@ private: unsigned m_num_sse_needed[3]; /* Number of 256-bit vector permutation. */ unsigned m_num_avx256_vec_perm[3]; + /* Number of reductions for FMA/DOT_PROD_EXPR/SAD_EXPR */ + unsigned m_num_reduc[X86_REDUC_LAST]; + /* Don't do unroll if m_prefer_unroll is false, default is true. */ + bool m_prefer_unroll; }; ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar) : vector_costs (vinfo, costing_for_scalar), m_num_gpr_needed (), m_num_sse_needed (), - m_num_avx256_vec_perm () -{ -} + m_num_avx256_vec_perm (), + m_num_reduc (), + m_prefer_unroll (true) +{} /* Implement targetm.vectorize.create_costs. */ @@ -26099,6 +26135,125 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, } } + /* Record number of load/store/gather/scatter in vectorized body. */ + if (where == vect_body && !m_costing_for_scalar) + { + switch (kind) + { + /* Emulated gather/scatter or any scalarization. */ + case scalar_load: + case scalar_stmt: + case scalar_store: + case vector_gather_load: + case vector_scatter_store: + m_prefer_unroll = false; + break; + + case vector_stmt: + case vec_to_scalar: + /* Count number of reduction FMA and "real" DOT_PROD_EXPR, + unroll in the vectorizer will enable partial sum. */ + if (stmt_info + && vect_is_reduction (stmt_info) + && stmt_info->stmt) + { + /* Handle __builtin_fma. */ + if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA) + { + m_num_reduc[X86_REDUC_FMA] += count; + break; + } + + if (!is_gimple_assign (stmt_info->stmt)) + break; + + tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt); + machine_mode inner_mode = GET_MODE_INNER (mode); + tree rhs1, rhs2; + bool native_vnni_p = true; + gimple* def; + machine_mode mode_rhs; + switch (subcode) + { + case PLUS_EXPR: + case MINUS_EXPR: + if (!fp || !flag_associative_math + || flag_fp_contract_mode != FP_CONTRACT_FAST) + break; + + /* FMA condition for different modes. */ + if (((inner_mode == DFmode || inner_mode == SFmode) + && !TARGET_FMA && !TARGET_AVX512VL) + || (inner_mode == HFmode && !TARGET_AVX512FP16) + || (inner_mode == BFmode && !TARGET_AVX10_2)) + break; + + /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed + to FMA/FNMA after vectorization. */ + rhs1 = gimple_assign_rhs1 (stmt_info->stmt); + rhs2 = gimple_assign_rhs2 (stmt_info->stmt); + if (subcode == PLUS_EXPR + && TREE_CODE (rhs1) == SSA_NAME + && (def = SSA_NAME_DEF_STMT (rhs1), true) + && is_gimple_assign (def) + && gimple_assign_rhs_code (def) == MULT_EXPR) + m_num_reduc[X86_REDUC_FMA] += count; + else if (TREE_CODE (rhs2) == SSA_NAME + && (def = SSA_NAME_DEF_STMT (rhs2), true) + && is_gimple_assign (def) + && gimple_assign_rhs_code (def) == MULT_EXPR) + m_num_reduc[X86_REDUC_FMA] += count; + break; + + /* Vectorizer lane_reducing_op_p supports DOT_PROX_EXPR, + WIDEN_SUM_EXPR and SAD_EXPR, x86 backend only supports + SAD_EXPR (usad{v16qi,v32qi,v64qi}) and DOT_PROD_EXPR. */ + case DOT_PROD_EXPR: + rhs1 = gimple_assign_rhs1 (stmt_info->stmt); + mode_rhs = TYPE_MODE (TREE_TYPE (rhs1)); + if (mode_rhs == QImode) + { + rhs2 = gimple_assign_rhs2 (stmt_info->stmt); + signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1)); + signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2)); + + /* vpdpbusd. */ + if (signop1_p != signop2_p) + native_vnni_p + = (GET_MODE_SIZE (mode) == 64 + ? TARGET_AVX512VNNI + : ((TARGET_AVX512VNNI && TARGET_AVX512VL) + || TARGET_AVXVNNI)); + else + /* vpdpbssd. */ + native_vnni_p + = (GET_MODE_SIZE (mode) == 64 + ? 
TARGET_AVX10_2 + : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2)); + } + m_num_reduc[X86_REDUC_DOT_PROD] += count; + + /* Dislike to do unroll and partial sum for + emulated DOT_PROD_EXPR. */ + if (!native_vnni_p) + m_num_reduc[X86_REDUC_DOT_PROD] += 3 * count; + break; + + case SAD_EXPR: + m_num_reduc[X86_REDUC_SAD] += count; + break; + + default: + break; + } + } + + default: + break; + } + } + + combined_fn cfn; if ((kind == vector_stmt || kind == scalar_stmt) && stmt_info @@ -26161,8 +26316,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node)))) != INTEGER_CST)) - || (SLP_TREE_MEMORY_ACCESS_TYPE (node) - == VMAT_GATHER_SCATTER))))) + || mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))))) { stmt_cost = ix86_default_vector_cost (kind, mode); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); @@ -26306,6 +26460,41 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()) > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo)))) m_costs[vect_body] = INT_MAX; + + bool any_reduc_p = false; + for (int i = 0; i != X86_REDUC_LAST; i++) + if (m_num_reduc[i]) + { + any_reduc_p = true; + break; + } + + if (any_reduc_p + /* Not much gain for loop with gather and scatter. */ + && m_prefer_unroll + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + { + unsigned unroll_factor + = OPTION_SET_P (ix86_vect_unroll_limit) + ? ix86_vect_unroll_limit + : ix86_cost->vect_unroll_limit; + + if (unroll_factor > 1) + { + for (int i = 0 ; i != X86_REDUC_LAST; i++) + { + if (m_num_reduc[i]) + { + unsigned tmp = CEIL (ix86_cost->reduc_lat_mult_thr[i], + m_num_reduc[i]); + unroll_factor = MIN (unroll_factor, tmp); + } + } + + m_suggested_unroll_factor = 1 << ceil_log2 (unroll_factor); + } + } + } ix86_vect_estimate_reg_pressure (); @@ -27189,9 +27378,9 @@ ix86_memtag_can_tag_addresses () return ix86_lam_type != lam_none && TARGET_LP64; } -/* Implement TARGET_MEMTAG_TAG_SIZE. */ +/* Implement TARGET_MEMTAG_TAG_BITSIZE. */ unsigned char -ix86_memtag_tag_size () +ix86_memtag_tag_bitsize () { return IX86_HWASAN_TAG_SIZE; } @@ -27762,6 +27951,10 @@ static const scoped_attribute_specs *const ix86_attribute_table[] = #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST ix86_address_cost +#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P +#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \ + ix86_use_by_pieces_infrastructure_p + #undef TARGET_OVERLAP_OP_BY_PIECES_P #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true @@ -28165,8 +28358,8 @@ ix86_libgcc_floating_mode_supported_p #undef TARGET_MEMTAG_UNTAGGED_POINTER #define TARGET_MEMTAG_UNTAGGED_POINTER ix86_memtag_untagged_pointer -#undef TARGET_MEMTAG_TAG_SIZE -#define TARGET_MEMTAG_TAG_SIZE ix86_memtag_tag_size +#undef TARGET_MEMTAG_TAG_BITSIZE +#define TARGET_MEMTAG_TAG_BITSIZE ix86_memtag_tag_bitsize #undef TARGET_GEN_CCMP_FIRST #define TARGET_GEN_CCMP_FIRST ix86_gen_ccmp_first diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 49af963..ac0ce68 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -102,6 +102,15 @@ struct stringop_algs #define COSTS_N_BYTES(N) ((N) * 2) #endif + +enum ix86_reduc_unroll_factor{ + X86_REDUC_FMA, + X86_REDUC_DOT_PROD, + X86_REDUC_SAD, + + X86_REDUC_LAST +}; + /* Define the specific costs for a given cpu. NB: hard_register is used by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute hard register move costs by register allocator. 
Relative costs of @@ -225,6 +234,13 @@ struct processor_costs { to number of instructions executed in parallel. See also ix86_reassociation_width. */ + const unsigned reduc_lat_mult_thr[X86_REDUC_LAST]; + /* Latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + const unsigned vect_unroll_limit; /* Limit how much the autovectorizer + may unroll a loop. */ struct stringop_algs *memcpy, *memset; const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer cost model. */ @@ -644,7 +660,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); {"cpu_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \ {"arch", "%{!march=*:-march=%(VALUE)}"}, \ {"arch_32", "%{" OPT_ARCH32 ":%{!march=*:-march=%(VALUE)}}"}, \ - {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, + {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, \ + {"tls", "%{!mtls-dialect=*:-mtls-dialect=%(VALUE)}"}, /* Specs for the compiler proper */ @@ -2865,6 +2882,9 @@ struct GTY(()) machine_function { approximation. */ BOOL_BITFIELD tls_descriptor_call_expanded_p : 1; + /* True if TLS descriptor is called more than once. */ + BOOL_BITFIELD tls_descriptor_call_multiple_p : 1; + /* If true, the current function has a STATIC_CHAIN is placed on the stack below the return address. */ BOOL_BITFIELD static_chain_on_stack : 1; @@ -2934,6 +2954,9 @@ struct GTY(()) machine_function { /* True if this is a recursive function. */ BOOL_BITFIELD recursive_function : 1; + /* True if by_pieces op is currently in use. */ + BOOL_BITFIELD by_pieces_in_use : 1; + /* The largest alignment, in bytes, of stack slot actually used. */ unsigned int max_used_stack_alignment; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 6686f10..cea6c15 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -901,6 +901,10 @@ (define_attr "avx_partial_xmm_update" "false,true" (const_string "false")) +;; Define attribute to indicate 64-bit TLS insns. +(define_attr "tls64" "gd,ld_base,call,combine,lea,none" + (const_string "none")) + ;; Define attribute to classify add/sub insns that consumes carry flag (CF) (define_attr "use_carry" "0,1" (const_string "0")) @@ -23153,6 +23157,7 @@ return "call\t{*%p2@GOTPCREL(%%rip)|[QWORD PTR %p2@GOTPCREL[rip]]}"; } [(set_attr "type" "multi") + (set_attr "tls64" "gd") (set (attr "length") (symbol_ref "TARGET_X32 ? 
15 : 16"))]) @@ -23191,7 +23196,11 @@ UNSPEC_TLS_GD) (clobber (match_operand:P 3 "register_operand"))])] "TARGET_64BIT" - "ix86_tls_descriptor_calls_expanded_in_cfun = true;") +{ + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) (define_insn "*tls_local_dynamic_base_32_gnu" [(set (match_operand:SI 0 "register_operand" "=a") @@ -23253,6 +23262,7 @@ return "call\t{*%p1@GOTPCREL(%%rip)|[QWORD PTR %p1@GOTPCREL[rip]]}"; } [(set_attr "type" "multi") + (set_attr "tls64" "ld_base") (set_attr "length" "12")]) (define_insn "*tls_local_dynamic_base_64_largepic" @@ -23286,7 +23296,11 @@ (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE) (clobber (match_operand:P 2 "register_operand"))])] "TARGET_64BIT" - "ix86_tls_descriptor_calls_expanded_in_cfun = true;") +{ + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) ;; Local dynamic of a single variable is a lose. Show combine how ;; to convert that back to global dynamic. @@ -23480,6 +23494,8 @@ "TARGET_64BIT && TARGET_GNU2_TLS" { operands[2] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0]; + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; ix86_tls_descriptor_calls_expanded_in_cfun = true; }) @@ -23491,6 +23507,7 @@ "lea%z0\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}" [(set_attr "type" "lea") (set_attr "mode" "<MODE>") + (set_attr "tls64" "lea") (set_attr "length" "7") (set_attr "length_address" "4")]) @@ -23504,6 +23521,7 @@ "TARGET_64BIT && TARGET_GNU2_TLS" "call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}" [(set_attr "type" "call") + (set_attr "tls64" "call") (set_attr "length" "2") (set_attr "length_address" "0")]) @@ -23525,7 +23543,8 @@ { operands[4] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0]; emit_insn (gen_tls_dynamic_gnu2_64 (ptr_mode, operands[4], operands[1])); -}) +} + [(set_attr "tls64" "combine")]) (define_split [(match_operand 0 "tls_address_pattern")] diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index c93c0b1..6bda22f 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1246,6 +1246,10 @@ munroll-only-small-loops Target Var(ix86_unroll_only_small_loops) Init(0) Optimization Enable conservative small loop unrolling. +-param=ix86-vect-unroll-limit= +Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param +Limit how much the autovectorizer may unroll a loop. + mlam= Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none) -mlam=[none|u48|u57] Instrument meta data position in user data pointers. diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 175798c..5dbe444 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1319,6 +1319,9 @@ (ior (match_operand 0 "nonimmediate_operand") (match_test "const_vec_duplicate_p (op)"))) +(define_predicate "const_vec_dup_operand" + (match_test "const_vec_duplicate_p (op)")) + ;; Return true when OP is either register operand, or any ;; CONST_VECTOR. 
(define_predicate "reg_or_const_vector_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ec74f93..73906b8 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -326,6 +326,9 @@ (define_mode_iterator VI1_AVX512VL [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")]) +(define_mode_iterator VI1_AVX512_3264 + [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX")]) + ;; All vector modes (define_mode_iterator V [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI @@ -26559,9 +26562,9 @@ ;; XOP packed rotate instructions (define_expand "rotl<mode>3" - [(set (match_operand:VI_128 0 "register_operand") - (rotate:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand") + [(set (match_operand:VI248_128 0 "register_operand") + (rotate:VI248_128 + (match_operand:VI248_128 1 "nonimmediate_operand") (match_operand:SI 2 "general_operand")))] "TARGET_XOP" { @@ -26590,9 +26593,9 @@ }) (define_expand "rotr<mode>3" - [(set (match_operand:VI_128 0 "register_operand") - (rotatert:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand") + [(set (match_operand:VI248_128 0 "register_operand") + (rotatert:VI248_128 + (match_operand:VI248_128 1 "nonimmediate_operand") (match_operand:SI 2 "general_operand")))] "TARGET_XOP" { @@ -26964,31 +26967,122 @@ int i; if (<CODE> != ASHIFT) - { - if (CONST_INT_P (operands[2])) - operands[2] = GEN_INT (-INTVAL (operands[2])); - else - negate = true; - } + { + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (-INTVAL (operands[2])); + else + negate = true; + } par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); tmp = lowpart_subreg (QImode, operands[2], SImode); for (i = 0; i < 16; i++) - XVECEXP (par, 0, i) = tmp; + XVECEXP (par, 0, i) = tmp; tmp = gen_reg_rtx (V16QImode); emit_insn (gen_vec_initv16qiqi (tmp, par)); if (negate) - emit_insn (gen_negv16qi2 (tmp, tmp)); + emit_insn (gen_negv16qi2 (tmp, tmp)); gen = (<CODE> == LSHIFTRT ? 
gen_xop_shlv16qi3 : gen_xop_shav16qi3); emit_insn (gen (operands[0], operands[1], tmp)); } + else if (TARGET_GFNI && CONST_INT_P (operands[2]) + && (<MODE_SIZE> == 64 + || !(INTVAL (operands[2]) == 7 && <CODE> == ASHIFTRT))) + { + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], + <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix, + const0_rtx)); + } else ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]); DONE; }) +(define_expand "cond_<insn><mode>" + [(set (match_operand:VI1_AVX512VL 0 "register_operand") + (vec_merge:VI1_AVX512VL + (any_shift:VI1_AVX512VL + (match_operand:VI1_AVX512VL 2 "register_operand") + (match_operand:VI1_AVX512VL 3 "const_vec_dup_operand")) + (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "TARGET_GFNI && TARGET_AVX512F" +{ + rtx count = XVECEXP (operands[3], 0, 0); + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix, + const0_rtx, operands[4], + operands[1])); + DONE; +}) + +(define_expand "<insn><mode>3" + [(set (match_operand:VI1_AVX512_3264 0 "register_operand") + (any_rotate:VI1_AVX512_3264 + (match_operand:VI1_AVX512_3264 1 "register_operand") + (match_operand:SI 2 "const_int_operand")))] + "TARGET_GFNI" +{ + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix, + const0_rtx)); + DONE; +}) + +(define_expand "<insn>v16qi3" + [(set (match_operand:V16QI 0 "register_operand") + (any_rotate:V16QI + (match_operand:V16QI 1 "nonimmediate_operand") + (match_operand:SI 2 "general_operand")))] + "TARGET_GFNI || TARGET_XOP" +{ + /* Handle the V16QI XOP case to avoid a conflict with the other expand. */ + if (TARGET_XOP) + { + if (! const_0_to_7_operand (operands[2], SImode)) + { + rtvec vs = rtvec_alloc (16); + rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + rtx op2 = operands[2]; + int i; + + if (GET_MODE (op2) != QImode) + { + op2 = gen_reg_rtx (QImode); + convert_move (op2, operands[2], false); + } + + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = op2; + + emit_insn (gen_vec_initv16qiqi (reg, par)); + if (<CODE> == ROTATERT) + { + rtx neg = gen_reg_rtx (V16QImode); + emit_insn (gen_negv16qi2 (neg, reg)); + emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], neg)); + reg = neg; + } + emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg)); + DONE; + } + } + else if (TARGET_GFNI && CONST_INT_P (operands[2])) + { + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>); + emit_insn (gen_vgf2p8affineqb_v16qi (operands[0], + force_reg (V16QImode, operands[1]), + matrix, const0_rtx)); + DONE; + } + else + FAIL; +}) + (define_expand "ashrv2di3" [(set (match_operand:V2DI 0 "register_operand") (ashiftrt:V2DI diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index c8603b9..1649ea2 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -141,6 +141,12 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. 
*/ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ ix86_size_memcpy, ix86_size_memset, COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ @@ -261,6 +267,12 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ i386_memcpy, i386_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -382,6 +394,12 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ i486_memcpy, i486_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -501,6 +519,12 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentium_memcpy, pentium_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -613,6 +637,12 @@ struct processor_costs lakemont_cost = { COSTS_N_INSNS (5), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentium_memcpy, pentium_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -740,6 +770,12 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentiumpro_memcpy, pentiumpro_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -858,6 +894,12 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ geode_memcpy, geode_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -979,6 +1021,12 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (2), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. 
*/ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ k6_memcpy, k6_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1101,6 +1149,12 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ athlon_memcpy, athlon_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1232,6 +1286,12 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ k8_memcpy, k8_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1371,6 +1431,12 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ amdfam10_memcpy, amdfam10_memset, COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ @@ -1503,6 +1569,12 @@ const struct processor_costs bdver_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ bdver_memcpy, bdver_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1668,6 +1740,12 @@ struct processor_costs znver1_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {5, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver1_memcpy, znver1_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1836,6 +1914,12 @@ struct processor_costs znver2_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {10, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1979,6 +2063,12 @@ struct processor_costs znver3_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. 
*/ + {8, 1, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2125,6 +2215,12 @@ struct processor_costs znver4_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2287,6 +2383,12 @@ struct processor_costs znver5_cost = { We increase width to 6 for multiplications in ix86_reassociation_width. */ 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2422,6 +2524,12 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ skylake_memcpy, skylake_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -2559,6 +2667,12 @@ struct processor_costs icelake_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 10, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ icelake_memcpy, icelake_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -2690,6 +2804,12 @@ struct processor_costs alderlake_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ alderlake_memcpy, alderlake_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2814,6 +2934,12 @@ const struct processor_costs btver1_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ btver1_memcpy, btver1_memset, COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ @@ -2935,6 +3061,12 @@ const struct processor_costs btver2_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. 
*/ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ btver2_memcpy, btver2_memset, COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ @@ -3055,6 +3187,12 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentium4_memcpy, pentium4_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3178,6 +3316,12 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ nocona_memcpy, nocona_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3299,6 +3443,12 @@ struct processor_costs atom_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 2, /* Limit how much the autovectorizer + may unroll a loop. */ atom_memcpy, atom_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3420,6 +3570,12 @@ struct processor_costs slm_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ slm_memcpy, slm_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3555,6 +3711,12 @@ struct processor_costs tremont_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ tremont_memcpy, tremont_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -3681,6 +3843,12 @@ struct processor_costs lujiazui_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ lujiazui_memcpy, lujiazui_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -3805,6 +3973,12 @@ struct processor_costs yongfeng_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. 
*/ 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ yongfeng_memcpy, yongfeng_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3929,6 +4103,12 @@ struct processor_costs shijidadao_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ shijidadao_memcpy, shijidadao_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -4078,6 +4258,12 @@ struct processor_costs generic_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ generic_memcpy, generic_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -4215,6 +4401,12 @@ struct processor_costs core_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ core_memcpy, core_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ diff --git a/gcc/config/loongarch/genopts/isa-evolution.in b/gcc/config/loongarch/genopts/isa-evolution.in index 50f72d5..836d93a 100644 --- a/gcc/config/loongarch/genopts/isa-evolution.in +++ b/gcc/config/loongarch/genopts/isa-evolution.in @@ -2,4 +2,5 @@ 2 26 div32 1.1 Support div.w[u] and mod.w[u] instructions with inputs not sign-extended. 2 27 lam-bh 1.1 Support am{swap/add}[_db].{b/h} instructions. 2 28 lamcas 1.1 Support amcas[_db].{b/h/w/d} instructions. +2 30 scq 1.1 Support sc.q instruction. 3 23 ld-seq-sa 1.1 Do not need load-load barriers (dbar 0x700). 
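The new "scq" evolution bit above (CPUCFG word 2, bit 30) advertises the LoongArch sc.q instruction, which the sync.md changes later in this patch pair with ll.d to perform a 128-bit conditional store and expand 16-byte atomics inline. A minimal C sketch of the code this targets, assuming a loongarch64 compiler built with this patch and -mscq in effect; the function name is illustrative only:

#include <stdatomic.h>
#include <stdbool.h>

/* With ISA_HAS_SCQ this compare-exchange is expected to expand to the
   inline ll.d/ld.d/sc.q loop (atomic_compare_and_swapti_scq below)
   rather than a call into libatomic; without -mscq the library call
   remains.  */
bool
cas16 (_Atomic unsigned __int128 *p, unsigned __int128 *expected,
       unsigned __int128 desired)
{
  return atomic_compare_exchange_strong (p, expected, desired);
}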
diff --git a/gcc/config/loongarch/loongarch-def.cc b/gcc/config/loongarch/loongarch-def.cc index 04b277e..dcd8d90 100644 --- a/gcc/config/loongarch/loongarch-def.cc +++ b/gcc/config/loongarch/loongarch-def.cc @@ -72,7 +72,7 @@ array_arch<loongarch_isa> loongarch_cpu_default_isa = .simd_ (ISA_EXT_SIMD_LASX) .evolution_ (OPTION_MASK_ISA_DIV32 | OPTION_MASK_ISA_LD_SEQ_SA | OPTION_MASK_ISA_LAM_BH | OPTION_MASK_ISA_LAMCAS - | OPTION_MASK_ISA_FRECIPE)) + | OPTION_MASK_ISA_FRECIPE | OPTION_MASK_ISA_SCQ)) .set (ARCH_LA64V1_0, loongarch_isa () .base_ (ISA_BASE_LA64) @@ -86,7 +86,7 @@ array_arch<loongarch_isa> loongarch_cpu_default_isa = .simd_ (ISA_EXT_SIMD_LSX) .evolution_ (OPTION_MASK_ISA_DIV32 | OPTION_MASK_ISA_LD_SEQ_SA | OPTION_MASK_ISA_LAM_BH | OPTION_MASK_ISA_LAMCAS - | OPTION_MASK_ISA_FRECIPE)); + | OPTION_MASK_ISA_FRECIPE | OPTION_MASK_ISA_SCQ)); static inline loongarch_cache la464_cache () diff --git a/gcc/config/loongarch/loongarch-def.h b/gcc/config/loongarch/loongarch-def.h index 0bcd2a7..0a7d0c9 100644 --- a/gcc/config/loongarch/loongarch-def.h +++ b/gcc/config/loongarch/loongarch-def.h @@ -78,12 +78,10 @@ extern loongarch_def_array<const char *, N_ISA_EXT_TYPES> /* Base ABI */ -enum { - ABI_BASE_LP64D = 0, - ABI_BASE_LP64F = 1, - ABI_BASE_LP64S = 2, - N_ABI_BASE_TYPES = 3 -}; +#define ABI_BASE_LP64D 0 +#define ABI_BASE_LP64F 1 +#define ABI_BASE_LP64S 2 +#define N_ABI_BASE_TYPES 3 extern loongarch_def_array<const char *, N_ABI_BASE_TYPES> loongarch_abi_base_strings; diff --git a/gcc/config/loongarch/loongarch-evolution.cc b/gcc/config/loongarch/loongarch-evolution.cc index de68624..a92a645 100644 --- a/gcc/config/loongarch/loongarch-evolution.cc +++ b/gcc/config/loongarch/loongarch-evolution.cc @@ -32,6 +32,7 @@ int la_evo_feature_masks[] = { OPTION_MASK_ISA_DIV32, OPTION_MASK_ISA_LAM_BH, OPTION_MASK_ISA_LAMCAS, + OPTION_MASK_ISA_SCQ, OPTION_MASK_ISA_LD_SEQ_SA, }; @@ -40,6 +41,7 @@ const char* la_evo_macro_name[] = { "__loongarch_div32", "__loongarch_lam_bh", "__loongarch_lamcas", + "__loongarch_scq", "__loongarch_ld_seq_sa", }; @@ -48,6 +50,7 @@ int la_evo_version_major[] = { 1, /* DIV32 */ 1, /* LAM_BH */ 1, /* LAMCAS */ + 1, /* SCQ */ 1, /* LD_SEQ_SA */ }; @@ -56,5 +59,6 @@ int la_evo_version_minor[] = { 1, /* DIV32 */ 1, /* LAM_BH */ 1, /* LAMCAS */ + 1, /* SCQ */ 1, /* LD_SEQ_SA */ }; diff --git a/gcc/config/loongarch/loongarch-evolution.h b/gcc/config/loongarch/loongarch-evolution.h index 5f90839..7fb7b0d 100644 --- a/gcc/config/loongarch/loongarch-evolution.h +++ b/gcc/config/loongarch/loongarch-evolution.h @@ -36,6 +36,7 @@ static constexpr struct { { 2, 1u << 26, OPTION_MASK_ISA_DIV32 }, { 2, 1u << 27, OPTION_MASK_ISA_LAM_BH }, { 2, 1u << 28, OPTION_MASK_ISA_LAMCAS }, + { 2, 1u << 30, OPTION_MASK_ISA_SCQ }, { 3, 1u << 23, OPTION_MASK_ISA_LD_SEQ_SA }, }; @@ -58,8 +59,9 @@ enum { EVO_DIV32 = 1, EVO_LAM_BH = 2, EVO_LAMCAS = 3, - EVO_LD_SEQ_SA = 4, - N_EVO_FEATURES = 5 + EVO_SCQ = 4, + EVO_LD_SEQ_SA = 5, + N_EVO_FEATURES = 6 }; /* Condition macros */ @@ -71,6 +73,8 @@ enum { (la_target.isa.evolution & OPTION_MASK_ISA_LAM_BH) #define ISA_HAS_LAMCAS \ (la_target.isa.evolution & OPTION_MASK_ISA_LAMCAS) +#define ISA_HAS_SCQ \ + (la_target.isa.evolution & OPTION_MASK_ISA_SCQ) #define ISA_HAS_LD_SEQ_SA \ (la_target.isa.evolution & OPTION_MASK_ISA_LD_SEQ_SA) diff --git a/gcc/config/loongarch/loongarch-str.h b/gcc/config/loongarch/loongarch-str.h index 1546ea3..583cce8 100644 --- a/gcc/config/loongarch/loongarch-str.h +++ b/gcc/config/loongarch/loongarch-str.h @@ -70,6 +70,7 @@ along 
with GCC; see the file COPYING3. If not see #define OPTSTR_DIV32 "div32" #define OPTSTR_LAM_BH "lam-bh" #define OPTSTR_LAMCAS "lamcas" +#define OPTSTR_SCQ "scq" #define OPTSTR_LD_SEQ_SA "ld-seq-sa" #endif /* LOONGARCH_STR_H */ diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index 493f95e..0935d7b 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -4388,6 +4388,7 @@ loongarch_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, break; } else if (TARGET_RECIP_VEC_DIV + && vectype && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN) { machine_mode mode = TYPE_MODE (vectype); @@ -6221,9 +6222,8 @@ loongarch_print_operand_reloc (FILE *file, rtx op, bool hi64_part, 'Q' Print R_LARCH_RELAX for TLS IE. 'r' Print address 12-31bit relocation associated with OP. 'R' Print address 32-51bit relocation associated with OP. - 'T' Print 'f' for (eq:CC ...), 't' for (ne:CC ...), - 'z' for (eq:?I ...), 'n' for (ne:?I ...). - 't' Like 'T', but with the EQ/NE cases reversed + 'T' Print a comment marker if %G outputs nothing. + 't' Print the register containing the higher 64 bits of a TImode. 'u' Print a LASX register. 'v' Print the insn size suffix b, h, w or d for vector modes V16QI, V8HI, V4SI, V2SI, and w, d for vector modes V4SF, V2DF respectively. @@ -6306,6 +6306,13 @@ loongarch_print_operand (FILE *file, rtx op, int letter) fputs ("dbar\t0x700", file); break; + case 'T': + if (!loongarch_cas_failure_memorder_needs_acquire ( + memmodel_from_int (INTVAL (op))) + && ISA_HAS_LD_SEQ_SA) + fprintf (file, "%s", ASM_COMMENT_START); + break; + case 'h': if (code == HIGH) op = XEXP (op, 0); @@ -6384,14 +6391,6 @@ loongarch_print_operand (FILE *file, rtx op, int letter) false /* lo_reloc */); break; - case 't': - case 'T': - { - int truth = (code == NE) == (letter == 'T'); - fputc ("zfnt"[truth * 2 + FCC_REG_P (REGNO (XEXP (op, 0)))], file); - } - break; - case 'V': if (CONST_VECTOR_P (op)) { @@ -6495,6 +6494,16 @@ loongarch_print_operand (FILE *file, rtx op, int letter) } break; + case 't': + if (GET_MODE (op) != TImode + || (op != CONST0_RTX (TImode) && code != REG)) + { + output_operand_lossage ("invalid use of '%%%c'", letter); + break; + } + op = loongarch_subword (op, 1); + letter = 'z'; + /* fall through */ default: switch (code) { @@ -10786,9 +10795,9 @@ loongarch_expand_vec_cmp (rtx operands[]) to a fixed type. */ static machine_mode -loongarch_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, +loongarch_promote_function_mode (const_tree type, machine_mode mode, - int *punsignedp ATTRIBUTE_UNUSED, + int *punsignedp, const_tree fntype ATTRIBUTE_UNUSED, int for_return ATTRIBUTE_UNUSED) { @@ -11154,6 +11163,46 @@ loongarch_c_mode_for_suffix (char suffix) return VOIDmode; } +/* Implement TARGET_C_BITINT_TYPE_INFO. + Return true if _BitInt(N) is supported and fill its details into *INFO. */ +bool +loongarch_bitint_type_info (int n, struct bitint_info *info) +{ + if (n <= 8) + info->limb_mode = QImode; + else if (n <= 16) + info->limb_mode = HImode; + else if (n <= 32) + info->limb_mode = SImode; + else if (n <= 64) + info->limb_mode = DImode; + else if (n <= 128) + info->limb_mode = TImode; + else + info->limb_mode = DImode; + + info->abi_limb_mode = info->limb_mode; + + if (n > 64) + info->abi_limb_mode = TImode; + + info->big_endian = false; + info->extended = true; + return true; +} + +/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. 
*/ + +static int +loongarch_compute_pressure_classes (reg_class *classes) +{ + int i = 0; + classes[i++] = GENERAL_REGS; + classes[i++] = FP_REGS; + classes[i++] = FCC_REGS; + return i; +} + /* Initialize the GCC target structure. */ #undef TARGET_ASM_ALIGNED_HI_OP #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" @@ -11428,6 +11477,12 @@ loongarch_c_mode_for_suffix (char suffix) #undef TARGET_C_MODE_FOR_SUFFIX #define TARGET_C_MODE_FOR_SUFFIX loongarch_c_mode_for_suffix +#undef TARGET_C_BITINT_TYPE_INFO +#define TARGET_C_BITINT_TYPE_INFO loongarch_bitint_type_info + +#undef TARGET_COMPUTE_PRESSURE_CLASSES +#define TARGET_COMPUTE_PRESSURE_CLASSES loongarch_compute_pressure_classes + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-loongarch.h" diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h index 5fc8665..e8819bf 100644 --- a/gcc/config/loongarch/loongarch.h +++ b/gcc/config/loongarch/loongarch.h @@ -270,7 +270,9 @@ along with GCC; see the file COPYING3. If not see if (GET_MODE_CLASS (MODE) == MODE_INT \ && GET_MODE_SIZE (MODE) < UNITS_PER_WORD) \ { \ - if ((MODE) == SImode) \ + if ((MODE) == SImode \ + && !(TYPE && TREE_CODE (TYPE) == BITINT_TYPE \ + && TYPE_PRECISION (TYPE) < 32)) \ (UNSIGNEDP) = 0; \ (MODE) = Pmode; \ } diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt index 4d85cf5..fbe61c0 100644 --- a/gcc/config/loongarch/loongarch.opt +++ b/gcc/config/loongarch/loongarch.opt @@ -334,6 +334,10 @@ mlamcas Target Mask(ISA_LAMCAS) Var(la_isa_evolution) Support amcas[_db].{b/h/w/d} instructions. +mscq +Target Mask(ISA_SCQ) Var(la_isa_evolution) +Support sc.q instruction. + mld-seq-sa Target Mask(ISA_LD_SEQ_SA) Var(la_isa_evolution) Do not need load-load barriers (dbar 0x700). 
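The loongarch_bitint_type_info hook added in loongarch.cc above turns on the C23 _BitInt extension for LoongArch: widths up to 64 bits use a single GPR-sized limb, wider values are lowered to 64-bit limbs with a TImode-sized ABI unit, and limbs are kept extended. A hedged sketch of what becomes accepted, assuming a loongarch64 compiler with this patch in C23 mode; nothing below is taken from the patch itself:

/* Illustrative only.  200-bit values are lowered by the bitint lowering
   pass into DImode limbs, while anything wider than 64 bits is passed
   and returned in TImode-sized ABI units, per loongarch_bitint_type_info.  */
_BitInt(200)
bitint_add (_BitInt(200) a, _BitInt(200) b)
{
  return a + b;
}

/* Sub-32-bit _BitInt values are no longer forced to promote as signed,
   which appears to be what the PROMOTE_MODE change in loongarch.h
   addresses.  */
unsigned _BitInt(20)
bitint_small (unsigned _BitInt(20) a)
{
  return a + 1uwb;
}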
diff --git a/gcc/config/loongarch/loongarch.opt.urls b/gcc/config/loongarch/loongarch.opt.urls index 5f644f6..606a211 100644 --- a/gcc/config/loongarch/loongarch.opt.urls +++ b/gcc/config/loongarch/loongarch.opt.urls @@ -90,6 +90,9 @@ UrlSuffix(gcc/LoongArch-Options.html#index-mlam-bh) mlamcas UrlSuffix(gcc/LoongArch-Options.html#index-mlamcas) +mscq +UrlSuffix(gcc/LoongArch-Options.html#index-mscq) + mld-seq-sa UrlSuffix(gcc/LoongArch-Options.html#index-mld-seq-sa) diff --git a/gcc/config/loongarch/simd.md b/gcc/config/loongarch/simd.md index dd17cd1..4156b26 100644 --- a/gcc/config/loongarch/simd.md +++ b/gcc/config/loongarch/simd.md @@ -773,7 +773,7 @@ (vec_select:<VEC_HALF> (match_operand:IVEC 2 "register_operand" "f") (match_operand:IVEC 4 "vect_par_cnst_even_or_odd_half"))) - (any_extend:<WVEC> + (any_extend:<WVEC_HALF> (vec_select:<VEC_HALF> (match_operand:IVEC 3 "register_operand" "f") (match_dup 4)))) diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md index fd8d732..2ee400e 100644 --- a/gcc/config/loongarch/sync.md +++ b/gcc/config/loongarch/sync.md @@ -21,25 +21,25 @@ (define_c_enum "unspec" [ UNSPEC_COMPARE_AND_SWAP + UNSPEC_COMPARE_AND_SWAP_AMCAS UNSPEC_COMPARE_AND_SWAP_ADD UNSPEC_COMPARE_AND_SWAP_SUB - UNSPEC_COMPARE_AND_SWAP_AND - UNSPEC_COMPARE_AND_SWAP_XOR - UNSPEC_COMPARE_AND_SWAP_OR UNSPEC_COMPARE_AND_SWAP_NAND UNSPEC_SYNC_OLD_OP UNSPEC_SYNC_EXCHANGE UNSPEC_ATOMIC_STORE UNSPEC_ATOMIC_LOAD UNSPEC_MEMORY_BARRIER + + UNSPEC_TI_FETCH_ADD + UNSPEC_TI_FETCH_SUB + UNSPEC_TI_FETCH_AND + UNSPEC_TI_FETCH_XOR + UNSPEC_TI_FETCH_OR + UNSPEC_TI_FETCH_NAND_MASK_INVERTED ]) (define_code_iterator any_atomic [plus ior xor and]) -(define_code_attr atomic_optab - [(plus "add") (ior "or") (xor "xor") (and "and")]) - -;; This attribute gives the format suffix for atomic memory operations. -(define_mode_attr amo [(QI "b") (HI "h") (SI "w") (DI "d")]) ;; <amop> expands to the name of the atomic operand that implements a ;; particular code. @@ -107,7 +107,7 @@ (define_insn "atomic_load<mode>" [(set (match_operand:QHWD 0 "register_operand" "=r") (unspec_volatile:QHWD - [(match_operand:QHWD 1 "memory_operand" "+m") + [(match_operand:QHWD 1 "memory_operand" "m") (match_operand:SI 2 "const_int_operand")] ;; model UNSPEC_ATOMIC_LOAD))] "" @@ -142,9 +142,50 @@ } [(set (attr "length") (const_int 12))]) +(define_insn "atomic_loadti_lsx" + [(set (match_operand:V2DI 0 "register_operand" "=f") + (unspec_volatile:V2DI + [(match_operand:TI 1 "memory_operand" "m") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPEC_ATOMIC_LOAD))] + "ISA_HAS_LSX && TARGET_64BIT" +{ + enum memmodel model = memmodel_base (INTVAL (operands[2])); + + switch (model) + { + case MEMMODEL_SEQ_CST: + output_asm_insn ("dbar\t0x11", operands); + /* fall through */ + case MEMMODEL_ACQUIRE: + case MEMMODEL_RELAXED: + return "vld\t%w0,%1\\n\\t%G2"; + + default: + gcc_unreachable (); + } +} + [(set (attr "length") (const_int 12))]) + +(define_expand "atomic_loadti" + [(match_operand:TI 0 "register_operand" "=r") + (match_operand:TI 1 "memory_operand" "m") + (match_operand:SI 2 "const_int_operand")] + "ISA_HAS_LSX && TARGET_64BIT" +{ + rtx vr = gen_reg_rtx (V2DImode); + + emit_insn (gen_atomic_loadti_lsx (vr, operands[1], operands[2])); + for (int i = 0; i < 2; i++) + emit_insn ( + gen_lsx_vpickve2gr_d (loongarch_subword (operands[0], i), vr, + GEN_INT (i))); + DONE; +}) + ;; Implement atomic stores with amoswap. Fall back to fences for atomic loads. 
(define_insn "atomic_store<mode>" - [(set (match_operand:QHWD 0 "memory_operand" "+m") + [(set (match_operand:QHWD 0 "memory_operand" "=m") (unspec_volatile:QHWD [(match_operand:QHWD 1 "reg_or_0_operand" "rJ") (match_operand:SI 2 "const_int_operand")] ;; model @@ -175,7 +216,67 @@ } [(set (attr "length") (const_int 12))]) -(define_insn "atomic_<atomic_optab><mode>" +(define_insn "atomic_storeti_lsx" + [(set (match_operand:TI 0 "memory_operand" "=m") + (unspec_volatile:TI + [(match_operand:V2DI 1 "register_operand" "f") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPEC_ATOMIC_STORE))] + "ISA_HAS_LSX && TARGET_64BIT" +{ + enum memmodel model = memmodel_base (INTVAL (operands[2])); + + switch (model) + { + case MEMMODEL_SEQ_CST: + return "dbar\t0x12\\n\\t" + "vst\t%w1,%0\\n\\t" + "dbar\t0x18"; + case MEMMODEL_RELEASE: + return "dbar\t0x12\\n\\t" + "vst\t%w1,%0"; + case MEMMODEL_RELAXED: + return "vst\t%w1,%0"; + default: + gcc_unreachable (); + } +} + [(set (attr "length") (const_int 12))]) + +(define_insn "atomic_storeti_scq" + [(set (match_operand:TI 0 "memory_operand" "=m") + (unspec_volatile:TI + [(match_operand:TI 1 "register_operand" "r")] + UNSPEC_ATOMIC_STORE)) + (clobber (match_scratch:DI 2 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ" + "1:\\n\\tll.d\t$r0,%0\n\tmove\t%2,%1\n\tsc.q\t%2,%t1,%0\n\tbeqz\t%2,1b" + [(set (attr "length") (const_int 16))]) + +(define_expand "atomic_storeti" + [(match_operand:TI 0 "memory_operand" "=m") + (match_operand:TI 1 "reg_or_0_operand" "rJ") + (match_operand:SI 2 "const_int_operand")] + "TARGET_64BIT && (ISA_HAS_LSX || ISA_HAS_SCQ)" +{ + if (!ISA_HAS_LSX) + { + emit_insn (gen_atomic_storeti_scq (operands[0], operands[1])); + DONE; + } + + rtx vr = gen_reg_rtx (V2DImode), op1 = operands[1]; + rtvec v = rtvec_alloc (2); + + for (int i = 0; i < 2; i++) + RTVEC_ELT (v, i) = loongarch_subword (op1, i); + + emit_insn (gen_vec_initv2didi (vr, gen_rtx_PARALLEL (V2DImode, v))); + emit_insn (gen_atomic_storeti_lsx (operands[0], vr, operands[2])); + DONE; +}) + +(define_insn "atomic_<amop><mode>" [(set (match_operand:GPR 0 "memory_operand" "+ZB") (unspec_volatile:GPR [(any_atomic:GPR (match_dup 0) @@ -183,7 +284,7 @@ (match_operand:SI 2 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] "" - "am<amop>%A2.<amo>\t$zero,%z1,%0" + "am<amop>%A2.<size>\t$zero,%z1,%0" [(set (attr "length") (const_int 4))]) (define_insn "atomic_add<mode>" @@ -194,10 +295,10 @@ (match_operand:SI 2 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] "ISA_HAS_LAM_BH" - "amadd%A2.<amo>\t$zero,%z1,%0" + "amadd%A2.<size>\t$zero,%z1,%0" [(set (attr "length") (const_int 4))]) -(define_insn "atomic_fetch_<atomic_optab><mode>" +(define_insn "atomic_fetch_<amop><mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") (match_operand:GPR 1 "memory_operand" "+ZB")) (set (match_dup 1) @@ -207,9 +308,52 @@ (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] "" - "am<amop>%A3.<amo>\t%0,%z2,%1" + "am<amop>%A3.<size>\t%0,%z2,%1" [(set (attr "length") (const_int 4))]) +(define_insn "atomic_fetch_nand_mask_inverted<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR + [(ior:GPR (not (match_dup 1)) + (match_operand:GPR 2 "register_operand" "r"))] + UNSPEC_SYNC_OLD_OP)) + (clobber (match_scratch:GPR 3 "=&r"))] + "" + { + return "1:\\n\\t" + "ll.<d>\\t%0,%1\\n\\t" + "orn\\t%3,%2,%0\\n\\t" + "sc.<d>\\t%3,%1\\n\\t" + "beqz\\t%3,1b"; + } + [(set (attr "length") (const_int 
16))]) + +(define_mode_iterator ALL_SC [GPR (TI "TARGET_64BIT && ISA_HAS_SCQ")]) +(define_mode_attr _scq [(SI "") (DI "") (TI "_scq")]) +(define_expand "atomic_fetch_nand<mode>" + [(match_operand:ALL_SC 0 "register_operand") + (match_operand:ALL_SC 1 "memory_operand") + (match_operand:ALL_SC 2 "reg_or_0_operand") + (match_operand:SI 3 "const_int_operand")] + "" + { + /* ~(atom & mask) = (~mask) | (~atom), so we can hoist + (~mask) out of the ll/sc loop and use the orn instruction in the + ll/sc loop. */ + rtx inverted_mask = gen_reg_rtx (<MODE>mode); + emit_move_insn (inverted_mask, + expand_simple_unop (<MODE>mode, NOT, operands[2], + NULL_RTX, false)); + + emit_insn ( + gen_atomic_fetch_nand_mask_inverted<mode><_scq> (operands[0], + operands[1], + inverted_mask)); + DONE; + }) + (define_insn "atomic_exchange<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") (unspec_volatile:GPR @@ -219,9 +363,44 @@ (set (match_dup 1) (match_operand:GPR 2 "register_operand" "r"))] "" - "amswap%A3.<amo>\t%0,%z2,%1" + "amswap%A3.<size>\t%0,%z2,%1" [(set (attr "length") (const_int 4))]) +(define_insn "atomic_exchangeti_scq" + [(set (match_operand:TI 0 "register_operand" "=&r") + (unspec_volatile:TI + [(match_operand:TI 1 "memory_operand" "+ZB")] + UNSPEC_SYNC_EXCHANGE)) + (set (match_dup 1) + (match_operand:TI 2 "register_operand" "rJ")) + (clobber (match_scratch:DI 3 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ" +{ + output_asm_insn ("1:", operands); + output_asm_insn ("ll.d\t%0,%1", operands); + if (!ISA_HAS_LD_SEQ_SA) + output_asm_insn ("dbar\t0x700", operands); + output_asm_insn ("ld.d\t%t0,%b1,8", operands); + output_asm_insn ("move\t%3,%z2", operands); + output_asm_insn ("sc.q\t%3,%t2,%1", operands); + output_asm_insn ("beqz\t%3,1b", operands); + + return ""; +} + [(set (attr "length") (const_int 24))]) + +(define_expand "atomic_exchangeti" + [(match_operand:TI 0 "register_operand" "=&r") + (match_operand:TI 1 "memory_operand" "+ZB") + (match_operand:TI 2 "register_operand" "rJ") + (match_operand:SI 3 "const_int_operand")] ;; model + "TARGET_64BIT && ISA_HAS_SCQ" +{ + emit_insn (gen_atomic_exchangeti_scq (operands[0], operands[1], + operands[2])); + DONE; +}) + (define_insn "atomic_exchange<mode>_short" [(set (match_operand:SHORT 0 "register_operand" "=&r") (unspec_volatile:SHORT @@ -231,7 +410,7 @@ (set (match_dup 1) (match_operand:SHORT 2 "register_operand" "r"))] "ISA_HAS_LAM_BH" - "amswap%A3.<amo>\t%0,%z2,%1" + "amswap%A3.<size>\t%0,%z2,%1" [(set (attr "length") (const_int 4))]) (define_insn "atomic_cas_value_strong<mode>" @@ -240,13 +419,13 @@ (set (match_dup 1) (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") (match_operand:GPR 3 "reg_or_0_operand" "rJ") - (match_operand:SI 4 "const_int_operand")] ;; mod_s + (match_operand:SI 4 "const_int_operand")] ;; mod_f UNSPEC_COMPARE_AND_SWAP)) (clobber (match_scratch:GPR 5 "=&r"))] "" { output_asm_insn ("1:", operands); - output_asm_insn ("ll.<amo>\t%0,%1", operands); + output_asm_insn ("ll.<size>\t%0,%1", operands); /* Like the test case atomic-cas-int.C, in loongarch64, O1 and higher, the return value of the val_without_const_folding will not be truncated and @@ -266,9 +445,9 @@ output_asm_insn ("bne\t%0,%z2,2f", operands); output_asm_insn ("or%i3\t%5,$zero,%3", operands); - output_asm_insn ("sc.<amo>\t%5,%1", operands); + output_asm_insn ("sc.<size>\t%5,%1", operands); output_asm_insn ("beqz\t%5,1b", operands); - output_asm_insn ("b\t3f", operands); + output_asm_insn ("%T4b\t3f", operands); output_asm_insn ("2:", operands); 
output_asm_insn ("%G4", operands); output_asm_insn ("3:", operands); @@ -288,10 +467,10 @@ (set (match_dup 1) (unspec_volatile:QHWD [(match_operand:QHWD 2 "reg_or_0_operand" "rJ") (match_operand:QHWD 3 "reg_or_0_operand" "rJ") - (match_operand:SI 4 "const_int_operand")] ;; mod_s - UNSPEC_COMPARE_AND_SWAP))] + (match_operand:SI 4 "const_int_operand")] ;; mod + UNSPEC_COMPARE_AND_SWAP_AMCAS))] "ISA_HAS_LAMCAS" - "ori\t%0,%z2,0\n\tamcas%A4.<amo>\t%0,%z3,%1" + "ori\t%0,%z2,0\n\tamcas%A4.<size>\t%0,%z3,%1" [(set (attr "length") (const_int 8))]) (define_expand "atomic_compare_and_swap<mode>" @@ -318,16 +497,14 @@ && is_mm_release (memmodel_base (INTVAL (mod_s)))) mod_s = GEN_INT (MEMMODEL_ACQ_REL); - operands[6] = mod_s; - if (ISA_HAS_LAMCAS) emit_insn (gen_atomic_cas_value_strong<mode>_amcas (operands[1], operands[2], operands[3], operands[4], - operands[6])); + mod_s)); else emit_insn (gen_atomic_cas_value_strong<mode> (operands[1], operands[2], operands[3], operands[4], - operands[6])); + mod_f)); rtx compare = operands[1]; if (operands[3] != const0_rtx) @@ -349,49 +526,74 @@ DONE; }) -(define_expand "atomic_test_and_set" - [(match_operand:QI 0 "register_operand" "") ;; bool output - (match_operand:QI 1 "memory_operand" "+ZB") ;; memory - (match_operand:SI 2 "const_int_operand" "")] ;; model +(define_expand "atomic_fetch_<amop><mode>" + [(match_operand:SHORT 0 "register_operand" "") ;; output + (any_bitwise (match_operand:SHORT 1 "memory_operand" "+ZB") ;; memory + (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) ;; val + (match_operand:SI 3 "const_int_operand" "")] ;; model "" { - /* We have no QImode atomics, so use the address LSBs to form a mask, - then use an aligned SImode atomic. */ + /* We have no QI/HImode bitwise atomics, so use the address LSBs to form + a mask, then use an aligned SImode atomic. */ rtx result = operands[0]; rtx mem = operands[1]; - rtx model = operands[2]; + rtx model = operands[3]; rtx addr = force_reg (Pmode, XEXP (mem, 0)); - rtx tmp_reg = gen_reg_rtx (Pmode); - rtx zero_reg = gen_rtx_REG (Pmode, 0); - + rtx mask = gen_int_mode (-4, Pmode); rtx aligned_addr = gen_reg_rtx (Pmode); - emit_move_insn (tmp_reg, gen_rtx_PLUS (Pmode, zero_reg, GEN_INT (-4))); - emit_move_insn (aligned_addr, gen_rtx_AND (Pmode, addr, tmp_reg)); + + if (!and_operand (mask, Pmode)) + mask = force_reg (Pmode, mask); + + emit_move_insn (aligned_addr, gen_rtx_AND (Pmode, addr, mask)); rtx aligned_mem = change_address (mem, SImode, aligned_addr); set_mem_alias_set (aligned_mem, 0); - rtx offset = gen_reg_rtx (SImode); - emit_move_insn (offset, gen_rtx_AND (SImode, gen_lowpart (SImode, addr), - GEN_INT (3))); - rtx tmp = gen_reg_rtx (SImode); - emit_move_insn (tmp, GEN_INT (1)); + emit_move_insn (tmp, simplify_gen_unary (ZERO_EXTEND, SImode, + operands[2], <MODE>mode)); + /* Note that we have defined SHIFT_COUNT_TRUNCATED to 1, so we don't need + to mask addr with 0b11 here. 
*/ rtx shmt = gen_reg_rtx (SImode); - emit_move_insn (shmt, gen_rtx_ASHIFT (SImode, offset, GEN_INT (3))); + emit_move_insn (shmt, gen_rtx_ASHIFT (SImode, gen_lowpart (SImode, addr), + GEN_INT (3))); rtx word = gen_reg_rtx (SImode); emit_move_insn (word, gen_rtx_ASHIFT (SImode, tmp, shmt)); + if (<is_and>) + { + /* word = word | ~(mode_mask << shmt) */ + rtx tmp = force_reg (SImode, + gen_int_mode (GET_MODE_MASK (<MODE>mode), + SImode)); + emit_move_insn (tmp, gen_rtx_ASHIFT (SImode, tmp, shmt)); + emit_move_insn (word, gen_rtx_IOR (SImode, gen_rtx_NOT (SImode, tmp), + word)); + } + tmp = gen_reg_rtx (SImode); - emit_insn (gen_atomic_fetch_orsi (tmp, aligned_mem, word, model)); + emit_insn (gen_atomic_fetch_<amop>si (tmp, aligned_mem, word, model)); emit_move_insn (gen_lowpart (SImode, result), gen_rtx_LSHIFTRT (SImode, tmp, shmt)); DONE; }) +(define_expand "atomic_test_and_set" + [(match_operand:QI 0 "register_operand" "") ;; bool output + (match_operand:QI 1 "memory_operand" "+ZB") ;; memory + (match_operand:SI 2 "const_int_operand" "")] ;; model + "" +{ + rtx one = force_reg (QImode, gen_int_mode (1, QImode)); + emit_insn (gen_atomic_fetch_orqi (operands[0], operands[1], one, + operands[2])); + DONE; +}) + (define_insn "atomic_cas_value_cmp_and_7_<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") (match_operand:GPR 1 "memory_operand" "+ZC")) @@ -400,20 +602,20 @@ (match_operand:GPR 3 "reg_or_0_operand" "rJ") (match_operand:GPR 4 "reg_or_0_operand" "rJ") (match_operand:GPR 5 "reg_or_0_operand" "rJ") - (match_operand:SI 6 "const_int_operand")] ;; model + (match_operand:SI 6 "const_int_operand")] ;; mod_f UNSPEC_COMPARE_AND_SWAP)) (clobber (match_scratch:GPR 7 "=&r"))] "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%2\\n\\t" "bne\\t%7,%z4,2f\\n\\t" "and\\t%7,%0,%z3\\n\\t" "or%i5\\t%7,%7,%5\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beq\\t$zero,%7,1b\\n\\t" - "b\\t3f\\n\\t" + "%T6b\\t3f\\n\\t" "2:\\n\\t" "%G6\\n\\t" "3:\\n\\t"; @@ -444,18 +646,16 @@ && is_mm_release (memmodel_base (INTVAL (mod_s)))) mod_s = GEN_INT (MEMMODEL_ACQ_REL); - operands[6] = mod_s; - if (ISA_HAS_LAMCAS) emit_insn (gen_atomic_cas_value_strong<mode>_amcas (operands[1], operands[2], operands[3], operands[4], - operands[6])); + mod_s)); else { union loongarch_gen_fn_ptrs generator; generator.fn_7 = gen_atomic_cas_value_cmp_and_7_si; loongarch_expand_atomic_qihi (generator, operands[1], operands[2], - operands[3], operands[4], operands[6]); + operands[3], operands[4], mod_f); } rtx compare = operands[1]; @@ -481,83 +681,96 @@ DONE; }) -(define_insn "atomic_cas_value_add_7_<mode>" - [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res - (match_operand:GPR 1 "memory_operand" "+ZC")) +(define_insn "atomic_compare_and_swapti_scq" + [(set (match_operand:TI 0 "register_operand" "=&r") + (match_operand:TI 1 "memory_operand" "+ZB")) (set (match_dup 1) - (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask - (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask - (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val - (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val - (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_ADD)) - (clobber (match_scratch:GPR 7 "=&r")) - (clobber (match_scratch:GPR 8 "=&r"))] - "" + (unspec_volatile:TI [(match_operand:TI 2 "reg_or_0_operand" "rJ") + (match_operand:TI 3 "reg_or_0_operand" "rJ") + (match_operand:SI 4 "const_int_operand")] ;; mod_f + 
UNSPEC_COMPARE_AND_SWAP)) + (clobber (match_scratch:DI 5 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ" { - return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" - "and\\t%7,%0,%3\\n\\t" - "add.w\\t%8,%0,%z5\\n\\t" - "and\\t%8,%8,%z2\\n\\t" - "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" - "beq\\t$zero,%7,1b"; -} + output_asm_insn ("1:", operands); + output_asm_insn ("ll.d\t%0,%1", operands); - [(set (attr "length") (const_int 28))]) + /* Compare the low word */ + output_asm_insn ("bne\t%0,%z2,2f", operands); -(define_insn "atomic_cas_value_sub_7_<mode>" - [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res - (match_operand:GPR 1 "memory_operand" "+ZC")) - (set (match_dup 1) - (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask - (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask - (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val - (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val - (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_SUB)) - (clobber (match_scratch:GPR 7 "=&r")) - (clobber (match_scratch:GPR 8 "=&r"))] - "" -{ - return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" - "and\\t%7,%0,%3\\n\\t" - "sub.w\\t%8,%0,%z5\\n\\t" - "and\\t%8,%8,%z2\\n\\t" - "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" - "beq\\t$zero,%7,1b"; + /* Don't reorder the load of high word before ll.d. As the TImode + must be aligned in the memory, the high and low words must be in + the same cacheline, thus dbar 0x700 is enough. */ + if (!ISA_HAS_LD_SEQ_SA) + output_asm_insn ("dbar\t0x700", operands); + + /* Now load the high word. As the high and low words are in the same + cacheline, in case another core has clobbered the high word before the + sc.q instruction is executed, the LL bit for the low word will be + cleared. Thus a normal load is sufficient. */ + output_asm_insn ("ld.d\t%t0,%b1,8", operands); + + /* Compare the high word. */ + output_asm_insn ("bne\t%t0,%t2,2f", operands); + + /* Copy the low word of the new value as it'll be clobbered by sc.q. */ + output_asm_insn ("move\t%5,%z3", operands); + + /* Store both words if LL bit is still set. */ + output_asm_insn ("sc.q\t%5,%t3,%1", operands); + + /* Check if sc.q has done the store. */ + output_asm_insn ("beqz\t%5,1b", operands); + + /* Jump over the mod_f barrier if sc.q has succeeded. */ + output_asm_insn ("%T4b\t3f", operands); + + /* The barrier for mod_f. 
*/ + output_asm_insn ("2:", operands); + output_asm_insn ("%G4", operands); + + output_asm_insn ("3:", operands); + return ""; } - [(set (attr "length") (const_int 28))]) + [(set_attr "length" "40")]) -(define_insn "atomic_cas_value_and_7_<mode>" - [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res - (match_operand:GPR 1 "memory_operand" "+ZC")) - (set (match_dup 1) - (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask - (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask - (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val - (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val - (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_AND)) - (clobber (match_scratch:GPR 7 "=&r")) - (clobber (match_scratch:GPR 8 "=&r"))] - "" +(define_expand "atomic_compare_and_swapti" + [(match_operand:SI 0 "register_operand" "") ;; bool output + (match_operand:TI 1 "register_operand" "") ;; val output + (match_operand:TI 2 "memory_operand" "") ;; memory + (match_operand:TI 3 "reg_or_0_operand" "") ;; expected value + (match_operand:TI 4 "reg_or_0_operand" "") ;; desired value + (match_operand:SI 5 "const_int_operand" "") ;; is_weak + (match_operand:SI 6 "const_int_operand" "") ;; mod_s + (match_operand:SI 7 "const_int_operand" "")] ;; mod_f + "TARGET_64BIT && ISA_HAS_SCQ" { - return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" - "and\\t%7,%0,%3\\n\\t" - "and\\t%8,%0,%z5\\n\\t" - "and\\t%8,%8,%z2\\n\\t" - "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" - "beq\\t$zero,%7,1b"; -} - [(set (attr "length") (const_int 28))]) + emit_insn (gen_atomic_compare_and_swapti_scq (operands[1], operands[2], + operands[3], operands[4], + operands[7])); + + rtx t[2]; -(define_insn "atomic_cas_value_xor_7_<mode>" + for (int i = 0; i < 2; i++) + { + rtx compare = loongarch_subword (operands[1], i); + rtx expect = loongarch_subword (operands[3], i); + + t[i] = gen_reg_rtx (DImode); + + if (expect != const0_rtx) + emit_insn (gen_xordi3 (t[i], compare, expect)); + else + emit_move_insn (t[i], compare); + } + + emit_insn (gen_iordi3 (t[0], t[0], t[1])); + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_EQ (SImode, t[0], const0_rtx))); + DONE; +}) + +(define_insn "atomic_cas_value_add_7_<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res (match_operand:GPR 1 "memory_operand" "+ZC")) (set (match_dup 1) @@ -566,24 +779,24 @@ (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_XOR)) + UNSPEC_COMPARE_AND_SWAP_ADD)) (clobber (match_scratch:GPR 7 "=&r")) (clobber (match_scratch:GPR 8 "=&r"))] "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%3\\n\\t" - "xor\\t%8,%0,%z5\\n\\t" + "add.w\\t%8,%0,%z5\\n\\t" "and\\t%8,%8,%z2\\n\\t" "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beq\\t$zero,%7,1b"; } [(set (attr "length") (const_int 28))]) -(define_insn "atomic_cas_value_or_7_<mode>" +(define_insn "atomic_cas_value_sub_7_<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res (match_operand:GPR 1 "memory_operand" "+ZC")) (set (match_dup 1) @@ -592,21 +805,20 @@ (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_OR)) + UNSPEC_COMPARE_AND_SWAP_SUB)) (clobber (match_scratch:GPR 7 "=&r")) (clobber 
(match_scratch:GPR 8 "=&r"))] "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%3\\n\\t" - "or\\t%8,%0,%z5\\n\\t" + "sub.w\\t%8,%0,%z5\\n\\t" "and\\t%8,%8,%z2\\n\\t" "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beq\\t$zero,%7,1b"; } - [(set (attr "length") (const_int 28))]) (define_insn "atomic_cas_value_nand_7_<mode>" @@ -624,12 +836,12 @@ "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%3\\n\\t" "and\\t%8,%0,%z5\\n\\t" "xor\\t%8,%8,%z2\\n\\t" "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beq\\t$zero,%7,1b"; } [(set (attr "length") (const_int 28))]) @@ -648,10 +860,10 @@ "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%z3\\n\\t" "or%i5\\t%7,%7,%5\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beqz\\t%7,1b\\n\\t"; } [(set (attr "length") (const_int 20))]) @@ -678,6 +890,101 @@ DONE; }) +(define_int_iterator UNSPEC_TI_FETCH_DIRECT + [UNSPEC_TI_FETCH_ADD + UNSPEC_TI_FETCH_SUB + UNSPEC_TI_FETCH_AND + UNSPEC_TI_FETCH_XOR + UNSPEC_TI_FETCH_OR]) +(define_int_iterator UNSPEC_TI_FETCH + [UNSPEC_TI_FETCH_DIRECT UNSPEC_TI_FETCH_NAND_MASK_INVERTED]) +(define_int_attr amop_ti_fetch + [(UNSPEC_TI_FETCH_ADD "add") + (UNSPEC_TI_FETCH_SUB "sub") + (UNSPEC_TI_FETCH_AND "and") + (UNSPEC_TI_FETCH_XOR "xor") + (UNSPEC_TI_FETCH_OR "or") + (UNSPEC_TI_FETCH_NAND_MASK_INVERTED "nand_mask_inverted")]) +(define_int_attr size_ti_fetch + [(UNSPEC_TI_FETCH_ADD "36") + (UNSPEC_TI_FETCH_SUB "36") + (UNSPEC_TI_FETCH_AND "28") + (UNSPEC_TI_FETCH_XOR "28") + (UNSPEC_TI_FETCH_OR "28") + (UNSPEC_TI_FETCH_NAND_MASK_INVERTED "28")]) + +(define_insn "atomic_fetch_<amop_ti_fetch>ti_scq" + [(set (match_operand:TI 0 "register_operand" "=&r") + (match_operand:TI 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:TI + [(match_dup 0) + (match_operand:TI 2 "reg_or_0_operand" "rJ")] + UNSPEC_TI_FETCH)) + (clobber (match_scratch:DI 3 "=&r")) + (clobber (match_scratch:DI 4 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ" +{ + output_asm_insn ("1:", operands); + output_asm_insn ("ll.d\t%0,%1", operands); + if (!ISA_HAS_LD_SEQ_SA) + output_asm_insn ("dbar\t0x700", operands); + output_asm_insn ("ld.d\t%t0,%b1,8", operands); + + switch (<UNSPEC_TI_FETCH>) + { + case UNSPEC_TI_FETCH_AND: + case UNSPEC_TI_FETCH_OR: + case UNSPEC_TI_FETCH_XOR: + output_asm_insn ("<amop_ti_fetch>\t%3,%0,%z2", operands); + output_asm_insn ("<amop_ti_fetch>\t%4,%t0,%t2", operands); + break; + case UNSPEC_TI_FETCH_NAND_MASK_INVERTED: + output_asm_insn ("orn\t%3,%z2,%0", operands); + output_asm_insn ("orn\t%4,%t2,%t0", operands); + break; + case UNSPEC_TI_FETCH_ADD: + case UNSPEC_TI_FETCH_SUB: + output_asm_insn ("<amop_ti_fetch>.d\t%3,%0,%z2", operands); + + /* Generate carry bit. */ + output_asm_insn ( + <UNSPEC_TI_FETCH> == UNSPEC_TI_FETCH_ADD ? 
"sltu\t%4,%3,%0" + : "sltu\t%4,%0,%3", + operands); + + output_asm_insn ("<amop_ti_fetch>.d\t%4,%t0,%4", operands); + output_asm_insn ("<amop_ti_fetch>.d\t%4,%4,%t2", operands); + break; + default: + gcc_unreachable (); + } + + output_asm_insn ("sc.q\t%3,%4,%1", operands); + output_asm_insn ("beqz\t%3,1b", operands); + + return ""; +} + [(set_attr "length" "<size_ti_fetch>")]) + +(define_expand "atomic_fetch_<amop_ti_fetch>ti" + [(set (match_operand:TI 0 "register_operand" "=&r") + (match_operand:TI 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:TI + [(match_dup 0) + (match_operand:TI 2 "reg_or_0_operand" "rJ")] + UNSPEC_TI_FETCH_DIRECT)) + (match_operand:SI 3 "const_int_operand")] ;; model + "TARGET_64BIT && ISA_HAS_SCQ" +{ + /* Model is ignored as sc.q implies a full barrier. */ + emit_insn (gen_atomic_fetch_<amop_ti_fetch>ti_scq (operands[0], + operands[1], + operands[2])); + DONE; +}) + (define_insn "atomic_fetch_add<mode>_short" [(set (match_operand:SHORT 0 "register_operand" "=&r") (match_operand:SHORT 1 "memory_operand" "+ZB")) @@ -688,7 +995,7 @@ (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] "ISA_HAS_LAM_BH" - "amadd%A3.<amo>\t%0,%z2,%1" + "amadd%A3.<size>\t%0,%z2,%1" [(set (attr "length") (const_int 4))]) (define_expand "atomic_fetch_add<mode>" @@ -724,7 +1031,7 @@ (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] - "" + "!ISA_HAS_LAM_BH" { union loongarch_gen_fn_ptrs generator; generator.fn_7 = gen_atomic_cas_value_sub_7_si; @@ -733,60 +1040,6 @@ DONE; }) -(define_expand "atomic_fetch_and<mode>" - [(set (match_operand:SHORT 0 "register_operand" "=&r") - (match_operand:SHORT 1 "memory_operand" "+ZB")) - (set (match_dup 1) - (unspec_volatile:SHORT - [(and:SHORT (match_dup 1) - (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) - (match_operand:SI 3 "const_int_operand")] ;; model - UNSPEC_SYNC_OLD_OP))] - "" -{ - union loongarch_gen_fn_ptrs generator; - generator.fn_7 = gen_atomic_cas_value_and_7_si; - loongarch_expand_atomic_qihi (generator, operands[0], operands[1], - operands[1], operands[2], operands[3]); - DONE; -}) - -(define_expand "atomic_fetch_xor<mode>" - [(set (match_operand:SHORT 0 "register_operand" "=&r") - (match_operand:SHORT 1 "memory_operand" "+ZB")) - (set (match_dup 1) - (unspec_volatile:SHORT - [(xor:SHORT (match_dup 1) - (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) - (match_operand:SI 3 "const_int_operand")] ;; model - UNSPEC_SYNC_OLD_OP))] - "" -{ - union loongarch_gen_fn_ptrs generator; - generator.fn_7 = gen_atomic_cas_value_xor_7_si; - loongarch_expand_atomic_qihi (generator, operands[0], operands[1], - operands[1], operands[2], operands[3]); - DONE; -}) - -(define_expand "atomic_fetch_or<mode>" - [(set (match_operand:SHORT 0 "register_operand" "=&r") - (match_operand:SHORT 1 "memory_operand" "+ZB")) - (set (match_dup 1) - (unspec_volatile:SHORT - [(ior:SHORT (match_dup 1) - (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) - (match_operand:SI 3 "const_int_operand")] ;; model - UNSPEC_SYNC_OLD_OP))] - "" -{ - union loongarch_gen_fn_ptrs generator; - generator.fn_7 = gen_atomic_cas_value_or_7_si; - loongarch_expand_atomic_qihi (generator, operands[0], operands[1], - operands[1], operands[2], operands[3]); - DONE; -}) - (define_expand "atomic_fetch_nand<mode>" [(set (match_operand:SHORT 0 "register_operand" "=&r") (match_operand:SHORT 1 "memory_operand" "+ZB")) diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index 
e224ade..494f14c 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -2363,8 +2363,14 @@ enum reg_class #define STACK_GROWS_DOWNWARD 1 -#define FRAME_GROWS_DOWNWARD (flag_stack_protect != 0 \ - || (flag_sanitize & SANITIZE_ADDRESS) != 0) +/* Growing the frame downwards allows us to put spills closest to + the stack pointer which is good as they are likely to be accessed + frequently. We can also arrange for normal stack usage to place + scalars last so that they too are close to the stack pointer. */ +#define FRAME_GROWS_DOWNWARD ((TARGET_MIPS16 \ + && TARGET_FRAME_GROWS_DOWNWARDS) \ + || (flag_stack_protect != 0 \ + || (flag_sanitize & SANITIZE_ADDRESS) != 0)) /* Size of the area allocated in the frame to save the GP. */ diff --git a/gcc/config/mips/mips.opt b/gcc/config/mips/mips.opt index e245654..f07db5a 100644 --- a/gcc/config/mips/mips.opt +++ b/gcc/config/mips/mips.opt @@ -473,6 +473,10 @@ mframe-header-opt Target Var(flag_frame_header_optimization) Optimization Optimize frame header. +mgrow-frame-downwards +Target Var(TARGET_FRAME_GROWS_DOWNWARDS) Init(1) Undocumented +Change the behaviour to grow the frame downwards. + noasmopt Driver diff --git a/gcc/config/pru/pru.cc b/gcc/config/pru/pru.cc index 322e319..3fdc56e 100644 --- a/gcc/config/pru/pru.cc +++ b/gcc/config/pru/pru.cc @@ -941,10 +941,19 @@ pru_init_libfuncs (void) /* Long long. */ set_optab_libfunc (ashr_optab, DImode, "__pruabi_asrll"); - set_optab_libfunc (smul_optab, DImode, "__pruabi_mpyll"); set_optab_libfunc (ashl_optab, DImode, "__pruabi_lslll"); set_optab_libfunc (lshr_optab, DImode, "__pruabi_lsrll"); + if (TARGET_OPT_MUL) + { + set_optab_libfunc (smul_optab, DImode, "__pruabi_mpyll"); + } + else + { + set_optab_libfunc (smul_optab, DImode, "__pruabi_softmpyll"); + set_optab_libfunc (smul_optab, SImode, "__pruabi_softmpyi"); + } + set_optab_libfunc (sdiv_optab, SImode, "__pruabi_divi"); set_optab_libfunc (udiv_optab, SImode, "__pruabi_divu"); set_optab_libfunc (smod_optab, SImode, "__pruabi_remi"); diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h index 6c0719b..9d547ed 100644 --- a/gcc/config/pru/pru.h +++ b/gcc/config/pru/pru.h @@ -65,6 +65,9 @@ #undef ENDFILE_SPEC #define ENDFILE_SPEC "%{!mabi=ti:-lgloss} " +#undef MULTILIB_DEFAULTS +#define MULTILIB_DEFAULTS { "mloop", "mmul", "mfillzero" } + /* TI ABI mandates that ELF symbols do not start with any prefix. */ #undef USER_LABEL_PREFIX #define USER_LABEL_PREFIX "" diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md index 3504e42..b8ef55b 100644 --- a/gcc/config/pru/pru.md +++ b/gcc/config/pru/pru.md @@ -215,7 +215,7 @@ mov\\t%0, %1 ldi\\t%0, %%pmem(%1) ldi\\t%0, %1 - fill\\t%0, 4 + * return TARGET_OPT_FILLZERO ? \"fill\\t%0, 4\" : \"ldi32\\t%0, 0xffffffff\"; ldi32\\t%0, %1" [(set_attr "type" "st,ld,alu,alu,alu,alu,alu") (set_attr "length" "4,4,4,4,4,4,8")]) @@ -259,9 +259,11 @@ case 1: return "lb%B1o\\t%b0, %1, %S1"; case 2: - return "zero\\t%F0, 8"; + return TARGET_OPT_FILLZERO ? "zero\\t%F0, 8" + : "ldi\\t%F0, 0\;ldi\\t%N0, 0"; case 3: - return "fill\\t%F0, 8"; + return TARGET_OPT_FILLZERO ? "fill\\t%F0, 8" + : "ldi32\\t%F0, 0xffffffff\;mov\\t%N0, %F0"; case 4: /* careful with overlapping source and destination regs. 
*/ gcc_assert (GP_REG_P (REGNO (operands[0]))); @@ -502,7 +504,7 @@ (define_insn "zero_extendqidi2" [(set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (match_operand:QI 1 "register_operand" "0,r")))] - "" + "TARGET_OPT_FILLZERO" "@ zero\\t%F0.b1, 7 mov\\t%F0.b0, %1\;zero\\t%F0.b1, 7" @@ -512,7 +514,7 @@ (define_insn "zero_extendhidi2" [(set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (match_operand:HI 1 "register_operand" "0,r")))] - "" + "TARGET_OPT_FILLZERO" "@ zero\\t%F0.b2, 6 mov\\t%F0.w0, %1\;zero\\t%F0.b2, 6" @@ -522,7 +524,7 @@ (define_insn "zero_extendsidi2" [(set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (match_operand:SI 1 "register_operand" "0,r")))] - "" + "TARGET_OPT_FILLZERO" "@ zero\\t%N0, 4 mov\\t%F0, %1\;zero\\t%N0, 4" @@ -535,7 +537,7 @@ (define_expand "extend<EQS0:mode><EQDHIDI:mode>2" [(set (match_operand:EQDHIDI 0 "register_operand" "=r") (sign_extend:EQDHIDI (match_operand:EQS0 1 "register_operand" "r")))] - "" + "TARGET_OPT_FILLZERO" { rtx_code_label *skip_hiset_label; @@ -744,7 +746,7 @@ (ior:HIDI (match_operand:HIDI 1 "register_operand" "0") (match_operand:HIDI 2 "const_fillbytes_operand" "Uf")))] - "" + "TARGET_OPT_FILLZERO" { static char line[64]; pru_byterange r; @@ -767,7 +769,7 @@ (and:HIDI (match_operand:HIDI 1 "register_operand" "0") (match_operand:HIDI 2 "const_zerobytes_operand" "Uz")))] - "" + "TARGET_OPT_FILLZERO" { static char line[64]; pru_byterange r; @@ -1114,7 +1116,8 @@ /* Try with the more efficient zero/fill patterns first. */ if (<LOGICAL_BITOP:CODE> == IOR && CONST_INT_P (operands[2]) - && const_fillbytes_operand (operands[2], DImode)) + && const_fillbytes_operand (operands[2], DImode) + && TARGET_OPT_FILLZERO) { rtx insn = maybe_gen_pru_ior_fillbytes (DImode, operands[0], @@ -1130,7 +1133,8 @@ } if (<LOGICAL_BITOP:CODE> == AND && CONST_INT_P (operands[2]) - && const_zerobytes_operand (operands[2], DImode)) + && const_zerobytes_operand (operands[2], DImode) + && TARGET_OPT_FILLZERO) { rtx insn = maybe_gen_pru_and_zerobytes (DImode, operands[0], @@ -1212,7 +1216,7 @@ [(set (match_operand:SI 0 "pru_muldst_operand" "=Rmd0") (mult:SI (match_operand:SI 1 "pru_mulsrc0_operand" "%Rms0") (match_operand:SI 2 "pru_mulsrc1_operand" "Rms1")))] - "" + "TARGET_OPT_MUL" "nop\;xin\\t0, %0, 4" [(set_attr "type" "alu") (set_attr "length" "8")]) diff --git a/gcc/config/pru/pru.opt b/gcc/config/pru/pru.opt index 8385beb..5206b2a 100644 --- a/gcc/config/pru/pru.opt +++ b/gcc/config/pru/pru.opt @@ -39,6 +39,14 @@ mloop Target Mask(OPT_LOOP) Allow (or do not allow) gcc to use the LOOP instruction. +mmul +Target Mask(OPT_MUL) +Allow (or do not allow) gcc to use the PRU multiplier unit. + +mfillzero +Target Mask(OPT_FILLZERO) +Allow (or do not allow) gcc to use the FILL and ZERO instructions. + mabi= Target RejectNegative Joined Enum(pru_abi_t) Var(pru_current_abi) Init(PRU_ABI_GNU) Save Select target ABI variant. 
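As an illustrative aside (not part of the patch): the new -mmul and -mfillzero options gate the PRU multiplier unit and the FILL/ZERO instructions, and pru_init_libfuncs above now registers soft-multiply libcalls when multiplication is disabled. A minimal sketch of the expected effect on user code, assuming the __pruabi_softmpyll routine is supplied by the runtime libraries:

/* With -mno-mul the 64-bit multiply below is expected to expand to a call
   to the __pruabi_softmpyll libfunc registered above, instead of using the
   multiplier unit via xin; with -mmul it continues to use the hardware
   multiplier.  */
unsigned long long
mul64 (unsigned long long a, unsigned long long b)
{
  return a * b;
}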
diff --git a/gcc/config/pru/pru.opt.urls b/gcc/config/pru/pru.opt.urls index c87affb..5c57892 100644 --- a/gcc/config/pru/pru.opt.urls +++ b/gcc/config/pru/pru.opt.urls @@ -12,6 +12,12 @@ UrlSuffix(gcc/PRU-Options.html#index-mno-relax-1) mloop UrlSuffix(gcc/PRU-Options.html#index-mloop) +mmul +UrlSuffix(gcc/PRU-Options.html#index-mmul) + +mfillzero +UrlSuffix(gcc/PRU-Options.html#index-mfillzero) + mabi= UrlSuffix(gcc/PRU-Options.html#index-mabi-4) diff --git a/gcc/config/pru/t-multilib b/gcc/config/pru/t-multilib new file mode 100644 index 0000000..1e3c2b8 --- /dev/null +++ b/gcc/config/pru/t-multilib @@ -0,0 +1,29 @@ +# Copyright (C) 2025 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 3, or (at your option) any later +# version. +# +# GCC is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. + +MULTILIB_OPTIONS = +MULTILIB_OPTIONS += mloop/mno-loop +MULTILIB_OPTIONS += mmul/mno-mul +MULTILIB_OPTIONS += mfillzero/mno-fillzero + +# Build two variants: +# - Newer PRU core versions, present in AM335x and later. +# - Older PRU core versions, present in AM18xx. +MULTILIB_REQUIRED = +MULTILIB_REQUIRED += mloop/mmul/mfillzero +MULTILIB_REQUIRED += mno-loop/mno-mul/mno-fillzero diff --git a/gcc/config/riscv/arch-canonicalize b/gcc/config/riscv/arch-canonicalize index 5d24f5ed..15a3985 100755 --- a/gcc/config/riscv/arch-canonicalize +++ b/gcc/config/riscv/arch-canonicalize @@ -163,7 +163,19 @@ def parse_dep_exts(dep_exts_str): ext_name = match.group(1) condition_code = match.group(2) deps.append({'ext': ext_name, 'type': 'conditional', 'condition': condition_code}) - conditional_matches.append((match.start(), match.end())) + # The conditional_pattern RE matches only the first code block enclosed + # in braces. + # + # Extend the match to the condition block's closing brace, encompassing + # all code blocks, by simply trying to match the numbers of opening + # and closing braces. While crude, this avoids writing a complicated + # parse here. 
+ closing_braces_left = condition_code.count('{') - condition_code.count('}') + condition_end = match.end() + while closing_braces_left > 0: + condition_end = dep_exts_str.find('}', condition_end) + closing_braces_left -= 1 + conditional_matches.append((match.start(), condition_end)) # Remove conditional dependency blocks from the string remaining_str = dep_exts_str @@ -534,6 +546,11 @@ def run_unit_tests(): assert extensions[0]['name'] == 'test' assert len(extensions[0]['dep_exts']) == 2 + def test_parse_long_condition_block(): + """Test condition block containing several code blocks.""" + result = arch_canonicalize("rv32ec", "20191213") + assert "rv32ec_zca" in result + # Collect test functions test_functions = [ test_basic_arch_parsing, @@ -542,7 +559,8 @@ def run_unit_tests(): test_conditional_dependencies, test_parse_dep_exts, test_evaluate_conditional_dependency, - test_parse_define_riscv_ext + test_parse_define_riscv_ext, + test_parse_long_condition_block ] # Run tests manually first, then optionally with pytest diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md index 6531996..9695fdc 100644 --- a/gcc/config/riscv/autovec-opt.md +++ b/gcc/config/riscv/autovec-opt.md @@ -1679,6 +1679,26 @@ ;; Combine vec_duplicate + op.vv to op.vx ;; Include ;; - vadd.vx +;; - vsub.vx +;; - vrsub.vx +;; - vand.vx +;; - vor.vx +;; - vmul.vx +;; - vdiv.vx +;; - vdivu.vx +;; - vrem.vx +;; - vremu.vx +;; - vmax.vx +;; - vmaxu.vx +;; - vmin.vx +;; - vminu.vx +;; - vsadd.vx +;; - vsaddu.vx +;; - vssub.vx +;; - vssubu.vx +;; - vaadd.vx +;; - vaaddu.vx +;; - vmerge.vxm ;; ============================================================================= (define_insn_and_split "*<optab>_vx_<mode>" [(set (match_operand:V_VLSI 0 "register_operand") @@ -1694,6 +1714,8 @@ riscv_vector::expand_vx_binary_vec_dup_vec (operands[0], operands[2], operands[1], <CODE>, <MODE>mode); + + DONE; } [(set_attr "type" "vialu")]) @@ -1711,6 +1733,8 @@ riscv_vector::expand_vx_binary_vec_vec_dup (operands[0], operands[1], operands[2], <CODE>, <MODE>mode); + + DONE; } [(set_attr "type" "vialu")]) @@ -1782,6 +1806,69 @@ } [(set_attr "type" "vaalu")]) +(define_insn_and_split "*merge_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (if_then_else:V_VLSI + (match_operand:<VM> 3 "vector_mask_operand") + (vec_duplicate:V_VLSI + (match_operand:<VEL> 2 "reg_or_int_operand")) + (match_operand:V_VLSI 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + insn_code icode = code_for_pred_merge_scalar (<MODE>mode); + riscv_vector::emit_vlmax_insn (icode, riscv_vector::MERGE_OP, operands); + DONE; + } + [(set_attr "type" "vimerge")]) + +(define_insn_and_split "*vmacc_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (plus:V_VLSI + (mult:V_VLSI + (vec_duplicate:V_VLSI + (match_operand:<VEL> 1 "register_operand")) + (match_operand:V_VLSI 2 "register_operand")) + (match_operand:V_VLSI 3 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + insn_code icode = code_for_pred_mul_plus_vx (<MODE>mode); + rtx ops[] = {operands[0], operands[1], operands[2], operands[3], + RVV_VUNDEF(<MODE>mode)}; + riscv_vector::emit_vlmax_insn (icode, riscv_vector::TERNARY_OP, ops); + + DONE; + } + [(set_attr "type" "vimuladd")]) + +(define_insn_and_split "*vnmsac_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (minus:V_VLSI + (match_operand:V_VLSI 3 "register_operand") + (mult:V_VLSI + 
(vec_duplicate:V_VLSI + (match_operand:<VEL> 1 "register_operand")) + (match_operand:V_VLSI 2 "register_operand"))))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + insn_code icode = code_for_pred_vnmsac_vx (<MODE>mode); + rtx ops[] = {operands[0], operands[1], operands[2], operands[3], + RVV_VUNDEF(<MODE>mode)}; + riscv_vector::emit_vlmax_insn (icode, riscv_vector::TERNARY_OP, ops); + + DONE; + } + [(set_attr "type" "vimuladd")]) + + ;; ============================================================================= ;; Combine vec_duplicate + op.vv to op.vf ;; Include @@ -1962,3 +2049,98 @@ } [(set_attr "type" "vfwmuladd")] ) + +;; vfmul.vf +(define_insn_and_split "*vfmul_vf_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (mult:V_VLSF + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSF 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (MULT, <MODE>mode), + riscv_vector::BINARY_OP_FRM_DYN, operands); + DONE; + } + [(set_attr "type" "vfmul")] +) + +;; vfrdiv.vf +(define_insn_and_split "*vfrdiv_vf_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (div:V_VLSF + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSF 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_reverse_scalar (DIV, <MODE>mode), + riscv_vector::BINARY_OP_FRM_DYN, operands); + DONE; + } + [(set_attr "type" "vfdiv")] +) + +;; vfmin.vf +(define_insn_and_split "*vfmin_vf_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (smin:V_VLSF + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSF 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (SMIN, <MODE>mode), + riscv_vector::BINARY_OP, operands); + DONE; + } + [(set_attr "type" "vfminmax")] +) + +(define_insn_and_split "*vfmin_vf_ieee_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (unspec:V_VLSF [ + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSF 1 "register_operand") + ] UNSPEC_VFMIN))] + "TARGET_VECTOR && !HONOR_SNANS (<MODE>mode) && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (UNSPEC_VFMIN, <MODE>mode), + riscv_vector::BINARY_OP, operands); + DONE; + } + [(set_attr "type" "vfminmax")] +) + +(define_insn_and_split "*vfmin_vf_ieee_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (unspec:V_VLSF [ + (match_operand:V_VLSF 1 "register_operand") + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + ] UNSPEC_VFMIN))] + "TARGET_VECTOR && !HONOR_SNANS (<MODE>mode) && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (UNSPEC_VFMIN, <MODE>mode), + riscv_vector::BINARY_OP, operands); + DONE; + } + [(set_attr "type" "vfminmax")] +) diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md index 5ecaa19..979e0df 100644 --- a/gcc/config/riscv/constraints.md +++ b/gcc/config/riscv/constraints.md @@ -330,3 +330,7 @@ (define_constraint "Q" "An address operand that is valid for a prefetch instruction" (match_operand 0 "prefetch_operand")) + 
+(define_address_constraint "ZD" + "An address operand that is valid for a mips prefetch instruction" + (match_test "riscv_prefetch_offset_address_p (op, mode)")) diff --git a/gcc/config/riscv/gen-riscv-ext-opt.cc b/gcc/config/riscv/gen-riscv-ext-opt.cc index 17b8f5b..1ca339c 100644 --- a/gcc/config/riscv/gen-riscv-ext-opt.cc +++ b/gcc/config/riscv/gen-riscv-ext-opt.cc @@ -4,50 +4,6 @@ #include <stdio.h> #include "riscv-opts.h" -struct version_t -{ - int major; - int minor; - version_t (int major, int minor, - enum riscv_isa_spec_class spec = ISA_SPEC_CLASS_NONE) - : major (major), minor (minor) - {} - bool operator<(const version_t &other) const - { - if (major != other.major) - return major < other.major; - return minor < other.minor; - } - - bool operator== (const version_t &other) const - { - return major == other.major && minor == other.minor; - } -}; - -static void -print_ext_doc_entry (const std::string &ext_name, const std::string &full_name, - const std::string &desc, - const std::vector<version_t> &supported_versions) -{ - // Implementation of the function to print the documentation entry - // for the extension. - std::set<version_t> unique_versions; - for (const auto &version : supported_versions) - unique_versions.insert (version); - printf ("@item %s\n", ext_name.c_str ()); - printf ("@tab"); - for (const auto &version : unique_versions) - { - printf (" %d.%d", version.major, version.minor); - } - printf ("\n"); - printf ("@tab %s", full_name.c_str ()); - if (desc.size ()) - printf (", %s", desc.c_str ()); - printf ("\n\n"); -} - int main () { diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index 381f96c..bdb3d22 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -27,10 +27,14 @@ (ior (match_operand 0 "const_arith_operand") (match_operand 0 "register_operand"))) +(define_predicate "prefetch_const_operand" + (and (match_code "const_int") + (match_test "(IN_RANGE (INTVAL (op), 0, 511))"))) + ;; REG or REG+D where D fits in a simm12 and has the low 5 bits ;; off. The REG+D form can be reloaded into a temporary if needed ;; after FP elimination if that exposes an invalid offset. 
-(define_predicate "prefetch_operand" +(define_predicate "zicbop_prefetch_operand" (ior (match_operand 0 "register_operand") (and (match_test "const_arith_operand (op, VOIDmode)") (match_test "(INTVAL (op) & 0x1f) == 0")) @@ -39,6 +43,20 @@ (match_test "const_arith_operand (XEXP (op, 1), VOIDmode)") (match_test "(INTVAL (XEXP (op, 1)) & 0x1f) == 0")))) +;; REG or REG+D where D fits in a uimm9 +(define_predicate "mips_prefetch_operand" + (ior (match_operand 0 "register_operand") + (match_test "prefetch_const_operand (op, VOIDmode)") + (and (match_code "plus") + (match_test "register_operand (XEXP (op, 0), word_mode)") + (match_test "prefetch_const_operand (XEXP (op, 1), VOIDmode)")))) + +;; MIPS specific or Standard RISCV Extension +(define_predicate "prefetch_operand" + (if_then_else (match_test "TARGET_XMIPSCBOP") + (match_operand 0 "mips_prefetch_operand") + (match_operand 0 "zicbop_prefetch_operand"))) + (define_predicate "lui_operand" (and (match_code "const_int") (match_test "LUI_OPERAND (INTVAL (op))"))) diff --git a/gcc/config/riscv/riscv-avlprop.cc b/gcc/config/riscv/riscv-avlprop.cc index 3031c29..b8547a7 100644 --- a/gcc/config/riscv/riscv-avlprop.cc +++ b/gcc/config/riscv/riscv-avlprop.cc @@ -156,6 +156,7 @@ get_insn_vtype_mode (rtx_insn *rinsn) extract_insn_cached (rinsn); int mode_idx = get_attr_mode_idx (rinsn); gcc_assert (mode_idx != INVALID_ATTRIBUTE); + gcc_assert (mode_idx < recog_data.n_operands); return GET_MODE (recog_data.operand[mode_idx]); } @@ -205,6 +206,7 @@ simplify_replace_vlmax_avl (rtx_insn *rinsn, rtx new_avl) { int index = get_attr_avl_type_idx (rinsn); gcc_assert (index != INVALID_ATTRIBUTE); + gcc_assert (index < recog_data.n_operands); validate_change_or_fail (rinsn, recog_data.operand_loc[index], get_avl_type_rtx (avl_type::NONVLMAX), false); } @@ -361,6 +363,8 @@ pass_avlprop::get_vlmax_ta_preferred_avl (insn_info *insn) const is not depend on. */ extract_insn_cached (use_insn->rtl ()); int merge_op_idx = get_attr_merge_op_idx (use_insn->rtl ()); + gcc_assert (merge_op_idx == INVALID_ATTRIBUTE + || merge_op_idx < recog_data.n_operands); if (merge_op_idx != INVALID_ATTRIBUTE && !satisfies_constraint_vu (recog_data.operand[merge_op_idx]) && refers_to_regno_p (set->regno (), @@ -531,7 +535,14 @@ pass_avlprop::execute (function *fn) && !m_avl_propagations->get (candidate.second) && imm_avl_p (vtype_mode)) { - rtx new_avl = gen_int_mode (GET_MODE_NUNITS (vtype_mode), Pmode); + /* For segmented operations AVL refers to a single register and + not all NF registers. Therefore divide the mode size by NF + to obtain the proper AVL. 
*/ + int nf = 1; + if (riscv_v_ext_tuple_mode_p (vtype_mode)) + nf = get_nf (vtype_mode); + rtx new_avl = gen_int_mode + (GET_MODE_NUNITS (vtype_mode).to_constant () / nf, Pmode); simplify_replace_vlmax_avl (rinsn, new_avl); } } diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def index 98f3470..8f0f630 100644 --- a/gcc/config/riscv/riscv-cores.def +++ b/gcc/config/riscv/riscv-cores.def @@ -113,7 +113,7 @@ RISCV_CORE("xt-c908v", "rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicsr_" "zvfh_sstc_svinval_svnapot_svpbmt__xtheadba_" "xtheadbb_xtheadbs_xtheadcmo_xtheadcondmov_" "xtheadfmemidx_xtheadmac_xtheadmemidx_" - "xtheadmempair_xtheadsync_xtheadvdot", + "xtheadmempair_xtheadsync", "xt-c908") RISCV_CORE("xt-c910", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_" "xtheadba_xtheadbb_xtheadbs_xtheadcmo_" @@ -121,7 +121,7 @@ RISCV_CORE("xt-c910", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_" "xtheadmemidx_xtheadmempair_xtheadsync", "xt-c910") RISCV_CORE("xt-c910v2", "rv64imafdc_zicbom_zicbop_zicboz_zicntr_zicond_" - "zicsr_zifencei _zihintntl_zihintpause_zihpm_" + "zicsr_zifencei_zihintntl_zihintpause_zihpm_" "zawrs_zfa_zfbfmin_zfh_zca_zcb_zcd_zba_zbb_zbc_" "zbs_sscofpmf_sstc_svinval_svnapot_svpbmt_" "xtheadba_xtheadbb_xtheadbs_xtheadcmo_" @@ -135,13 +135,13 @@ RISCV_CORE("xt-c920", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_" "xtheadvector", "xt-c910") RISCV_CORE("xt-c920v2", "rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_" - "zicsr_zifencei _zihintntl_zihintpause_zihpm_" + "zicsr_zifencei_zihintntl_zihintpause_zihpm_" "zawrs_zfa_zfbfmin_zfh_zca_zcb_zcd_zba_zbb_zbc_" "zbs_zvfbfmin_zvfbfwma_zvfh_sscofpmf_sstc_" "svinval_svnapot_svpbmt_xtheadba_xtheadbb_" "xtheadbs_xtheadcmo_xtheadcondmov_xtheadfmemidx_" "xtheadmac_xtheadmemidx_xtheadmempair_" - "xtheadsync_xtheadvdot", + "xtheadsync", "xt-c920v2") RISCV_CORE("tt-ascalon-d8", "rv64imafdcv_zic64b_zicbom_zicbop_zicboz_" diff --git a/gcc/config/riscv/riscv-ext-mips.def b/gcc/config/riscv/riscv-ext-mips.def index 5d7836d..132f6c1 100644 --- a/gcc/config/riscv/riscv-ext-mips.def +++ b/gcc/config/riscv/riscv-ext-mips.def @@ -33,3 +33,16 @@ DEFINE_RISCV_EXT ( /* BITMASK_GROUP_ID. */ BITMASK_NOT_YET_ALLOCATED, /* BITMASK_BIT_POSITION. */ BITMASK_NOT_YET_ALLOCATED, /* EXTRA_EXTENSION_FLAGS. */ 0) + +DEFINE_RISCV_EXT ( + /* NAME. */ xmipscbop, + /* UPPERCASE_NAME. */ XMIPSCBOP, + /* FULL_NAME. */ "Mips Prefetch extension", + /* DESC. */ "", + /* URL. */ , + /* DEP_EXTS. */ ({}), + /* SUPPORTED_VERSIONS. */ ({{1, 0}}), + /* FLAG_GROUP. */ xmips, + /* BITMASK_GROUP_ID. */ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_BIT_POSITION. */ BITMASK_NOT_YET_ALLOCATED, + /* EXTRA_EXTENSION_FLAGS. 
*/ 0) diff --git a/gcc/config/riscv/riscv-ext.opt b/gcc/config/riscv/riscv-ext.opt index 26d6e68..ced05d2 100644 --- a/gcc/config/riscv/riscv-ext.opt +++ b/gcc/config/riscv/riscv-ext.opt @@ -449,3 +449,5 @@ Mask(XTHEADVECTOR) Var(riscv_xthead_subext) Mask(XVENTANACONDOPS) Var(riscv_xventana_subext) Mask(XMIPSCMOV) Var(riscv_xmips_subext) + +Mask(XMIPSCBOP) Var(riscv_xmips_subext) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 539321f..46b256d 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -143,6 +143,8 @@ extern void riscv_expand_sstrunc (rtx, rtx); extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t); extern bool synthesize_ior_xor (rtx_code, rtx [3]); extern bool synthesize_and (rtx [3]); +extern bool synthesize_add (rtx [3]); +extern bool synthesize_add_extended (rtx [3]); #ifdef RTX_CODE extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0); @@ -830,16 +832,18 @@ extern bool th_print_operand_address (FILE *, machine_mode, rtx); extern bool strided_load_broadcast_p (void); extern bool riscv_use_divmod_expander (void); -void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); +void riscv_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, tree, int); extern bool riscv_option_valid_attribute_p (tree, tree, tree, int); extern bool riscv_option_valid_version_attribute_p (tree, tree, tree, int); extern bool -riscv_process_target_version_attr (tree, location_t); +riscv_process_target_version_attr (tree, location_t *); extern void riscv_override_options_internal (struct gcc_options *); extern void riscv_option_override (void); +extern rtx riscv_prefetch_cookie (rtx, rtx); +extern bool riscv_prefetch_offset_address_p (rtx, machine_mode); struct riscv_tune_param; /* Information about one micro-arch we know about. */ diff --git a/gcc/config/riscv/riscv-subset.h b/gcc/config/riscv/riscv-subset.h index a35537d..4cd860f 100644 --- a/gcc/config/riscv/riscv-subset.h +++ b/gcc/config/riscv/riscv-subset.h @@ -52,8 +52,9 @@ private: /* Original arch string. */ const char *m_arch; - /* Location of arch string, used for report error. */ - location_t m_loc; + /* A pointer to the location that should be used for diagnostics, + or null if diagnostics should be suppressed. */ + location_t *m_loc; /* Head of subset info list. */ riscv_subset_t *m_head; @@ -70,7 +71,7 @@ private: /* Allow adding the same extension more than once. 
*/ bool m_allow_adding_dup; - riscv_subset_list (const char *, location_t); + riscv_subset_list (const char *, location_t *); const char *parsing_subset_version (const char *, const char *, unsigned *, unsigned *, bool, bool *); @@ -106,12 +107,12 @@ public: riscv_subset_list *clone () const; - static riscv_subset_list *parse (const char *, location_t); + static riscv_subset_list *parse (const char *, location_t *); const char *parse_single_ext (const char *, bool exact_single_p = true); int match_score (riscv_subset_list *) const; - void set_loc (location_t); + void set_loc (location_t *); void set_allow_adding_dup (bool v) { m_allow_adding_dup = v; } @@ -182,7 +183,7 @@ extern void riscv_set_arch_by_subset_list (riscv_subset_list *, struct gcc_options *); extern bool riscv_minimal_hwprobe_feature_bits (const char *, struct riscv_feature_bits *, - location_t); + location_t *); extern bool riscv_ext_is_subset (struct cl_target_option *, struct cl_target_option *); diff --git a/gcc/config/riscv/riscv-target-attr.cc b/gcc/config/riscv/riscv-target-attr.cc index 8ad3025..5e01c92 100644 --- a/gcc/config/riscv/riscv-target-attr.cc +++ b/gcc/config/riscv/riscv-target-attr.cc @@ -34,7 +34,7 @@ namespace { class riscv_target_attr_parser { public: - riscv_target_attr_parser (location_t loc) + riscv_target_attr_parser (location_t *loc) : m_found_arch_p (false) , m_found_tune_p (false) , m_found_cpu_p (false) @@ -62,7 +62,7 @@ private: bool m_found_cpu_p; bool m_found_priority_p; riscv_subset_list *m_subset_list; - location_t m_loc; + location_t *m_loc; const riscv_cpu_info *m_cpu_info; const char *m_tune; int m_priority; @@ -102,15 +102,17 @@ riscv_target_attr_parser::parse_arch (const char *str) { if (TARGET_64BIT && strncmp ("32", str + 2, strlen ("32")) == 0) { - error_at (m_loc, "unexpected arch for %<target()%> attribute: " - "must start with rv64 but found %qs", str); + if (m_loc) + error_at (*m_loc, "unexpected arch for %<target()%> attribute: " + "must start with rv64 but found %qs", str); goto fail; } if (!TARGET_64BIT && strncmp ("64", str + 2, strlen ("64")) == 0) { - error_at (m_loc, "unexpected arch for %<target()%> attribute: " - "must start with rv32 but found %qs", str); + if (m_loc) + error_at (*m_loc, "unexpected arch for %<target()%> attribute: " + "must start with rv32 but found %qs", str); goto fail; } @@ -140,10 +142,9 @@ riscv_target_attr_parser::parse_arch (const char *str) { if (token[0] != '+') { - error_at ( - m_loc, - "unexpected arch for %<target()%> attribute: must start " - "with + or rv"); + if (*m_loc) + error_at (*m_loc, "unexpected arch for %<target()%> " + "attribute: must start with + or rv"); goto fail; } @@ -151,10 +152,9 @@ riscv_target_attr_parser::parse_arch (const char *str) /* Check parse_single_ext has consume all string. 
*/ if (*result != '\0') { - error_at ( - m_loc, - "unexpected arch for %<target()%> attribute: bad " - "string found %qs", token); + if (m_loc) + error_at (*m_loc, "unexpected arch for %<target()%> " + "attribute: bad string found %qs", token); goto fail; } @@ -179,8 +179,8 @@ fail: bool riscv_target_attr_parser::handle_arch (const char *str) { - if (m_found_arch_p) - error_at (m_loc, "%<target()%> attribute: arch appears more than once"); + if (m_found_arch_p && m_loc) + error_at (*m_loc, "%<target()%> attribute: arch appears more than once"); m_found_arch_p = true; return parse_arch (str); } @@ -190,15 +190,16 @@ riscv_target_attr_parser::handle_arch (const char *str) bool riscv_target_attr_parser::handle_cpu (const char *str) { - if (m_found_cpu_p) - error_at (m_loc, "%<target()%> attribute: cpu appears more than once"); + if (m_found_cpu_p && m_loc) + error_at (*m_loc, "%<target()%> attribute: cpu appears more than once"); m_found_cpu_p = true; const riscv_cpu_info *cpu_info = riscv_find_cpu (str); if (!cpu_info) { - error_at (m_loc, "%<target()%> attribute: unknown CPU %qs", str); + if (m_loc) + error_at (*m_loc, "%<target()%> attribute: unknown CPU %qs", str); return false; } @@ -218,14 +219,15 @@ riscv_target_attr_parser::handle_cpu (const char *str) bool riscv_target_attr_parser::handle_tune (const char *str) { - if (m_found_tune_p) - error_at (m_loc, "%<target()%> attribute: tune appears more than once"); + if (m_found_tune_p && m_loc) + error_at (*m_loc, "%<target()%> attribute: tune appears more than once"); m_found_tune_p = true; const struct riscv_tune_info *tune = riscv_parse_tune (str, true); if (tune == nullptr) { - error_at (m_loc, "%<target()%> attribute: unknown TUNE %qs", str); + if (m_loc) + error_at (*m_loc, "%<target()%> attribute: unknown TUNE %qs", str); return false; } @@ -237,13 +239,15 @@ riscv_target_attr_parser::handle_tune (const char *str) bool riscv_target_attr_parser::handle_priority (const char *str) { - if (m_found_priority_p) - error_at (m_loc, "%<target()%> attribute: priority appears more than once"); + if (m_found_priority_p && m_loc) + error_at (*m_loc, "%<target()%> attribute: priority appears " + "more than once"); m_found_priority_p = true; if (sscanf (str, "%d", &m_priority) != 1) { - error_at (m_loc, "%<target()%> attribute: invalid priority %qs", str); + if (m_loc) + error_at (*m_loc, "%<target()%> attribute: invalid priority %qs", str); return false; } @@ -282,7 +286,7 @@ riscv_target_attr_parser::update_settings (struct gcc_options *opts) const static bool riscv_process_one_target_attr (char *arg_str, - location_t loc, + location_t *loc, riscv_target_attr_parser &attr_parser, const struct riscv_attribute_info *attrs) { @@ -290,7 +294,8 @@ riscv_process_one_target_attr (char *arg_str, if (len == 0) { - error_at (loc, "malformed %<target()%> attribute"); + if (loc) + error_at (*loc, "malformed %<target()%> attribute"); return false; } @@ -302,10 +307,9 @@ riscv_process_one_target_attr (char *arg_str, if (!arg) { - error_at ( - loc, - "attribute %<target(\"%s\")%> does not accept an argument", - str_to_check); + if (loc) + error_at (*loc, "attribute %<target(\"%s\")%> does not " + "accept an argument", str_to_check); return false; } @@ -324,7 +328,8 @@ riscv_process_one_target_attr (char *arg_str, return (&attr_parser->*attr->handler) (arg); } - error_at (loc, "Got unknown attribute %<target(\"%s\")%>", str_to_check); + if (loc) + error_at (*loc, "Got unknown attribute %<target(\"%s\")%>", str_to_check); return false; } @@ -347,11 +352,12 @@ 
num_occurrences_in_str (char c, char *str) } /* Parse the string in ARGS that contains the target attribute information - and update the global target options space. */ + and update the global target options space. If LOC is nonnull, report + diagnostics against location *LOC, otherwise remain silent. */ bool riscv_process_target_attr (const char *args, - location_t loc, + location_t *loc, const struct riscv_attribute_info *attrs) { size_t len = strlen (args); @@ -387,8 +393,8 @@ riscv_process_target_attr (const char *args, if (num_attrs != num_semicolons + 1) { - error_at (loc, "malformed %<target(\"%s\")%> attribute", - args); + if (loc) + error_at (*loc, "malformed %<target(\"%s\")%> attribute", args); return false; } @@ -399,11 +405,12 @@ riscv_process_target_attr (const char *args, } /* Parse the tree in ARGS that contains the target attribute information - and update the global target options space. */ + and update the global target options space. If LOC is nonnull, report + diagnostics against *LOC, otherwise remain silent. */ static bool riscv_process_target_attr (tree args, - location_t loc, + location_t *loc, const struct riscv_attribute_info *attrs) { if (TREE_CODE (args) == TREE_LIST) @@ -424,7 +431,8 @@ riscv_process_target_attr (tree args, if (TREE_CODE (args) != STRING_CST) { - error_at (loc, "attribute %<target%> argument not a string"); + if (loc) + error_at (*loc, "attribute %<target%> argument not a string"); return false; } @@ -466,7 +474,7 @@ riscv_option_valid_attribute_p (tree fndecl, tree, tree args, int) TREE_TARGET_OPTION (target_option_default_node)); /* Now we can parse the attributes and set &global_options accordingly. */ - ret = riscv_process_target_attr (args, loc, riscv_target_attrs); + ret = riscv_process_target_attr (args, &loc, riscv_target_attrs); if (ret) { riscv_override_options_internal (&global_options); @@ -481,16 +489,19 @@ riscv_option_valid_attribute_p (tree fndecl, tree, tree args, int) } /* Parse the tree in ARGS that contains the target_version attribute - information and update the global target options space. */ + information and update the global target options space. If LOC is nonnull, + report diagnostics against *LOC, otherwise remain silent. */ bool -riscv_process_target_version_attr (tree args, location_t loc) +riscv_process_target_version_attr (tree args, location_t *loc) { if (TREE_CODE (args) == TREE_LIST) { if (TREE_CHAIN (args)) { - error ("attribute %<target_version%> has multiple values"); + if (loc) + error_at (*loc, "attribute %<target_version%> " + "has multiple values"); return false; } args = TREE_VALUE (args); @@ -498,7 +509,8 @@ riscv_process_target_version_attr (tree args, location_t loc) if (!args || TREE_CODE (args) != STRING_CST) { - error ("attribute %<target_version%> argument not a string"); + if (loc) + error_at (*loc, "attribute %<target_version%> argument not a string"); return false; } @@ -541,7 +553,7 @@ riscv_option_valid_version_attribute_p (tree fndecl, tree, tree args, int) cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (target_option_current_node)); - ret = riscv_process_target_version_attr (args, loc); + ret = riscv_process_target_version_attr (args, &loc); /* Set up any additional state. 
*/ if (ret) diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index c9c8328..b27a0be 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -63,20 +63,37 @@ imm_avl_p (machine_mode mode) { poly_uint64 nunits = GET_MODE_NUNITS (mode); + /* For segmented operations AVL refers to a single register and not all NF + registers. Therefore divide the mode size by NF before checking if it is + in range. */ + int nf = 1; + if (riscv_v_ext_tuple_mode_p (mode)) + nf = get_nf (mode); + return nunits.is_constant () /* The vsetivli can only hold register 0~31. */ - ? (IN_RANGE (nunits.to_constant (), 0, 31)) + ? (IN_RANGE (nunits.to_constant () / nf, 0, 31)) /* Only allowed in VLS-VLMAX mode. */ : false; } -/* Return true if LEN is equal to NUNITS that out of the range [0, 31]. */ +/* Return true if LEN equals the number of units in MODE if MODE is either a + VLA mode or MODE is a VLS mode its size equals the vector size. + In that case we can emit a VLMAX insn which can be optimized more easily + by the vsetvl pass. */ + static bool is_vlmax_len_p (machine_mode mode, rtx len) { poly_int64 value; + if (poly_int_rtx_p (len, &value) + && known_eq (value, GET_MODE_NUNITS (mode)) + && known_eq (GET_MODE_UNIT_SIZE (mode) * value, BYTES_PER_RISCV_VECTOR)) + return true; + return poly_int_rtx_p (len, &value) - && known_eq (value, GET_MODE_NUNITS (mode)); + && !GET_MODE_NUNITS (mode).is_constant () + && known_eq (value, GET_MODE_NUNITS (mode)); } /* Helper functions for insn_flags && insn_types */ @@ -954,6 +971,26 @@ emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask) emit_vlmax_insn (icode, BINARY_OP_TAMU, ops); } +/* Function to emit a vslide1up instruction of mode MODE with destination + DEST and slideup element ELT. */ + +rtx +expand_slide1up (machine_mode mode, rtx dest, rtx elt) +{ + unsigned int unspec + = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; + insn_code icode = code_for_pred_slide (unspec, mode); + /* RVV Spec 16.3.1 + The destination vector register group for vslideup cannot overlap the + source vector register group, otherwise the instruction encoding + is reserved. Thus, we need a new register. */ + rtx tmp = gen_reg_rtx (mode); + rtx ops[] = {tmp, dest, elt}; + emit_vlmax_insn (icode, BINARY_OP, ops); + return tmp; +} + + /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress): https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc @@ -1175,16 +1212,7 @@ expand_vector_init_trailing_same_elem (rtx target, { rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1)); for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--) - { - unsigned int unspec - = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; - insn_code icode = code_for_pred_slide (unspec, mode); - rtx tmp = gen_reg_rtx (mode); - rtx ops[] = {tmp, dup, builder.elt (i)}; - emit_vlmax_insn (icode, BINARY_OP, ops); - /* slide1up need source and dest to be different REG. */ - dup = tmp; - } + dup = expand_slide1up (mode, dup, builder.elt (i)); emit_move_insn (target, dup); return true; @@ -1717,6 +1745,77 @@ expand_const_vector_stepped (rtx target, rtx src, rvv_builder *builder) gcc_unreachable (); } +/* We don't actually allow this case in legitimate_constant_p but + the middle-end still expects us to handle it in an expander + (see PR121334). This is assumed to happen very rarely so the + implementation is not very efficient, particularly + for short vectors. 
+*/ + +static void +expand_const_vector_onestep (rtx target, rvv_builder &builder) +{ + machine_mode mode = GET_MODE (target); + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); + gcc_assert (builder.nelts_per_pattern () == 2); + + /* We have n encoded patterns + {csta_0, cstb_0}, + {csta_1, cstb_1}, + ... + {csta_{n-1}, cstb_{n-1}} + which should become one vector: + {csta_0, csta_1, ..., csta_{n-1}, + cstb_0, cstb_1, ..., cstb_{n-1}, + ... + cstb_0, cstb_1, ..., cstb_{n-1}}. + + In order to achieve this we create a permute/gather constant + sel = {0, 1, ..., n - 1, 0, 1, ..., n - 1, ...} + and two vectors + va = {csta_0, csta_1, ..., csta_{n-1}}, + vb = {cstb_0, cstb_1, ..., cstb_{n-1}}. + + Then we use a VLMAX gather to "broadcast" vb and afterwards + overwrite the first n elements with va. */ + + int n = builder.npatterns (); + /* { 0, 1, 2, ..., n - 1 }. */ + rtx vid = gen_reg_rtx (mode); + expand_vec_series (vid, const0_rtx, const1_rtx); + + /* { 0, 1, ..., n - 1, 0, 1, ..., n - 1, ... }. */ + rtx sel = gen_reg_rtx (mode); + rtx and_ops[] = {sel, vid, GEN_INT (n)}; + emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP, and_ops); + + /* va = { ELT (0), ELT (1), ... ELT (n - 1) }. */ + rtx tmp1 = gen_reg_rtx (mode); + rtx ops1[] = {tmp1, builder.elt (0)}; + expand_broadcast (mode, ops1); + for (int i = 1; i < n; i++) + tmp1 = expand_slide1up (mode, tmp1, builder.elt (i)); + + /* vb = { ELT (n), ELT (n + 1), ... ELT (2 * n - 1) }. */ + rtx tmp2 = gen_reg_rtx (mode); + rtx ops2[] = {tmp2, builder.elt (n)}; + expand_broadcast (mode, ops2); + for (int i = 1; i < n; i++) + tmp2 = expand_slide1up (mode, tmp2, builder.elt (n + i)); + + /* Duplicate vb. */ + rtx tmp3 = gen_reg_rtx (mode); + emit_vlmax_gather_insn (tmp3, tmp2, sel); + + /* Overwrite the first n - 1 elements with va. */ + rtx dest = gen_reg_rtx (mode); + insn_code icode = code_for_pred_mov (mode); + rtx ops3[] = {dest, tmp3, tmp1}; + emit_nonvlmax_insn (icode, __MASK_OP_TUMA | UNARY_OP_P, ops3, GEN_INT (n)); + + emit_move_insn (target, dest); +} + static void expand_const_vector (rtx target, rtx src) { @@ -1744,6 +1843,8 @@ expand_const_vector (rtx target, rtx src) if (CONST_VECTOR_DUPLICATE_P (src)) return expand_const_vector_duplicate (target, &builder); + else if (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2) + return expand_const_vector_onestep (target, builder); else if (CONST_VECTOR_STEPPED_P (src)) return expand_const_vector_stepped (target, src, &builder); @@ -2648,8 +2749,14 @@ expand_vector_init_merge_repeating_sequence (rtx target, = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode); uint64_t full_nelts = builder.full_nelts ().to_constant (); + gcc_assert (builder.nelts_per_pattern () == 1 + || builder.nelts_per_pattern () == 2); + + rtx first + = builder.nelts_per_pattern () == 1 ? builder.elt (0) : builder.elt (1); + /* Step 1: Broadcast the first pattern. */ - rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))}; + rtx ops[] = {target, force_reg (builder.inner_mode (), first)}; expand_broadcast (builder.mode (), ops); /* Step 2: Merge the rest iteration of pattern. */ for (unsigned int i = 1; i < builder.npatterns (); i++) @@ -2677,7 +2784,10 @@ expand_vector_init_merge_repeating_sequence (rtx target, emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup)); /* Step 2-2: Merge pattern according to the mask. 
*/ - rtx ops[] = {target, target, builder.elt (i), mask}; + unsigned int which = i; + if (builder.nelts_per_pattern () == 2) + which = 2 * which + 1; + rtx ops[] = {target, target, builder.elt (which), mask}; emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)), MERGE_OP, ops); } @@ -3220,15 +3330,17 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) mask_mode = get_mask_mode (data_mode); rtx mask = gen_reg_rtx (mask_mode); rtx max_sel = gen_const_vector_dup (sel_mode, nunits); + bool overlap = reg_overlap_mentioned_p (target, op1); + rtx tmp_target = overlap ? gen_reg_rtx (data_mode) : target; /* Step 1: generate a mask that should select everything >= nunits into the * mask. */ expand_vec_cmp (mask, GEU, sel_mod, max_sel); - /* Step2: gather every op0 values indexed by sel into target, + /* Step2: gather every op0 values indexed by sel into TMP_TARGET, we don't need to care about the result of the element whose index >= nunits. */ - emit_vlmax_gather_insn (target, op0, sel_mod); + emit_vlmax_gather_insn (tmp_target, op0, sel_mod); /* Step3: shift the range from (nunits, max_of_mode] to [0, max_of_mode - nunits]. */ @@ -3238,7 +3350,10 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) /* Step4: gather those into the previously masked-out elements of target. */ - emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask); + emit_vlmax_masked_gather_mu_insn (tmp_target, op1, tmp, mask); + + if (overlap) + emit_move_insn (tmp_target, target); } /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */ @@ -4078,11 +4193,7 @@ shuffle_off_by_one_patterns (struct expand_vec_perm_d *d) emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode)); /* Insert the scalar into element 0. */ - unsigned int unspec - = FLOAT_MODE_P (d->vmode) ? 
UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; - insn_code icode = code_for_pred_slide (unspec, d->vmode); - rtx ops[] = {d->target, d->op1, tmp}; - emit_vlmax_insn (icode, BINARY_OP, ops); + expand_slide1up (d->vmode, d->op1, tmp); } return true; @@ -4376,13 +4487,11 @@ expand_strided_load (machine_mode mode, rtx *ops) int idx = 4; get_else_operand (ops[idx++]); rtx len = ops[idx]; - poly_int64 len_val; insn_code icode = code_for_pred_strided_load (mode); rtx emit_ops[] = {v_reg, mask, gen_rtx_MEM (mode, base), stride}; - if (poly_int_rtx_p (len, &len_val) - && known_eq (len_val, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) emit_vlmax_insn (icode, BINARY_OP_TAMA, emit_ops); else { @@ -4400,11 +4509,9 @@ expand_strided_store (machine_mode mode, rtx *ops) rtx stride = ops[1]; rtx mask = ops[3]; rtx len = ops[4]; - poly_int64 len_val; rtx vl_type; - if (poly_int_rtx_p (len, &len_val) - && known_eq (len_val, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { len = gen_reg_rtx (Pmode); emit_vlmax_vsetvl (mode, len); diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 44ef44a..5e6cb67 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -607,7 +607,7 @@ costs::need_additional_vector_vars_p (stmt_vec_info stmt_info, if (type == load_vec_info_type || type == store_vec_info_type) { if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) return true; machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index e0d8904..591122f 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -3685,7 +3685,8 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src) /* This test can fail if (for example) we want a HF and Z[v]fh is not enabled. In that case we just want to let the standard expansion path run. */ - if (riscv_vector::get_vector_mode (smode, nunits).exists (&vmode)) + if (riscv_vector::get_vector_mode (smode, nunits).exists (&vmode) + && gen_lowpart_common (vmode, SUBREG_REG (src))) { rtx v = gen_lowpart (vmode, SUBREG_REG (src)); rtx int_reg = dest; @@ -3958,41 +3959,6 @@ riscv_extend_cost (rtx op, bool unsigned_p) return COSTS_N_INSNS (2); } -/* Return the cost of the vector binary rtx like add, minus, mult. - The cost of scalar2vr_cost will be appended if there one of the - op comes from the VEC_DUPLICATE. */ - -static int -get_vector_binary_rtx_cost (rtx x, int scalar2vr_cost) -{ - gcc_assert (riscv_v_ext_mode_p (GET_MODE (x))); - - rtx neg; - rtx op_0; - rtx op_1; - - if (GET_CODE (x) == UNSPEC) - { - op_0 = XVECEXP (x, 0, 0); - op_1 = XVECEXP (x, 0, 1); - } - else - { - op_0 = XEXP (x, 0); - op_1 = XEXP (x, 1); - } - - if (GET_CODE (op_0) == VEC_DUPLICATE - || GET_CODE (op_1) == VEC_DUPLICATE) - return (scalar2vr_cost + 1) * COSTS_N_INSNS (1); - else if (GET_CODE (neg = op_0) == NEG - && (GET_CODE (op_1) == VEC_DUPLICATE - || GET_CODE (XEXP (neg, 0)) == VEC_DUPLICATE)) - return (scalar2vr_cost + 1) * COSTS_N_INSNS (1); - else - return COSTS_N_INSNS (1); -} - /* Implement TARGET_RTX_COSTS. 
*/ #define SINGLE_SHIFT_COST 1 @@ -4014,73 +3980,20 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN { case SET: { - switch (GET_CODE (x)) + if (GET_CODE (x) == VEC_DUPLICATE) + *total = (scalar2vr_cost + 1) * COSTS_N_INSNS (1); + else { - case VEC_DUPLICATE: - *total = gr2vr_cost * COSTS_N_INSNS (1); - break; - case IF_THEN_ELSE: - { - rtx op = XEXP (x, 1); + int vec_dup_count = 0; + subrtx_var_iterator::array_type array; - switch (GET_CODE (op)) - { - case DIV: - case UDIV: - case MOD: - case UMOD: - case US_PLUS: - case US_MINUS: - case SS_PLUS: - case SS_MINUS: - *total = get_vector_binary_rtx_cost (op, scalar2vr_cost); - break; - case UNSPEC: - { - switch (XINT (op, 1)) - { - case UNSPEC_VAADDU: - case UNSPEC_VAADD: - *total - = get_vector_binary_rtx_cost (op, scalar2vr_cost); - break; - default: - *total = COSTS_N_INSNS (1); - break; - } - } - break; - default: - *total = COSTS_N_INSNS (1); - break; - } - } - break; - case PLUS: - case MINUS: - case AND: - case IOR: - case XOR: - case MULT: - case SMAX: - case UMAX: - case SMIN: - case UMIN: - { - rtx op; - rtx op_0 = XEXP (x, 0); - rtx op_1 = XEXP (x, 1); + FOR_EACH_SUBRTX_VAR (iter, array, x, ALL) + if (GET_CODE (*iter) == VEC_DUPLICATE) + vec_dup_count++; - if (GET_CODE (op = op_0) == MULT - || GET_CODE (op = op_1) == MULT) - *total = get_vector_binary_rtx_cost (op, scalar2vr_cost); - else - *total = get_vector_binary_rtx_cost (x, scalar2vr_cost); - } - break; - default: - *total = COSTS_N_INSNS (1); - break; + int total_vec_dup_cost = vec_dup_count * scalar2vr_cost; + + *total = COSTS_N_INSNS (1) * (total_vec_dup_cost + 1); } } break; @@ -5532,9 +5445,9 @@ canonicalize_comparands (rtx_code code, rtx *op0, rtx *op1) /* We might have been handed back a SUBREG. Just to make things easy, force it into a REG. */ - if (!REG_P (*op0) && !CONST_INT_P (*op0)) + if (!REG_P (*op0) && !CONST_INT_P (*op0) && INTEGRAL_MODE_P (GET_MODE (*op0))) *op0 = force_reg (word_mode, *op0); - if (!REG_P (*op1) && !CONST_INT_P (*op1)) + if (!REG_P (*op1) && !CONST_INT_P (*op1) && INTEGRAL_MODE_P (GET_MODE (*op1))) *op1 = force_reg (word_mode, *op1); } @@ -6213,7 +6126,8 @@ riscv_pass_vls_aggregate_in_gpr (struct riscv_arg_info *info, machine_mode mode, For a library call, FNTYPE is 0. */ void -riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype, rtx, tree, int) +riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, const_tree fntype, + rtx, tree, int) { memset (cum, 0, sizeof (*cum)); @@ -6494,30 +6408,44 @@ riscv_arg_partial_bytes (cumulative_args_t cum, return arg.stack_p ? arg.num_gprs * UNITS_PER_WORD : 0; } -/* Implement FUNCTION_VALUE and LIBCALL_VALUE. For normal calls, - VALTYPE is the return type and MODE is VOIDmode. For libcalls, - VALTYPE is null and MODE is the mode of the return value. */ +/* Implements hook TARGET_FUNCTION_VALUE. */ rtx -riscv_function_value (const_tree type, const_tree func, machine_mode mode) +riscv_function_value (const_tree ret_type, const_tree fn_decl_or_type, + bool) { struct riscv_arg_info info; CUMULATIVE_ARGS args; - if (type) + if (fn_decl_or_type) { - int unsigned_p = TYPE_UNSIGNED (type); + const_tree fntype = TREE_CODE (fn_decl_or_type) == FUNCTION_DECL ? 
+ TREE_TYPE (fn_decl_or_type) : fn_decl_or_type; + riscv_init_cumulative_args (&args, fntype, NULL_RTX, NULL_TREE, 0); + } + else + memset (&args, 0, sizeof args); - mode = TYPE_MODE (type); + int unsigned_p = TYPE_UNSIGNED (ret_type); - /* Since TARGET_PROMOTE_FUNCTION_MODE unconditionally promotes, - return values, promote the mode here too. */ - mode = promote_function_mode (type, mode, &unsigned_p, func, 1); - } + machine_mode mode = TYPE_MODE (ret_type); - memset (&args, 0, sizeof args); + /* Since TARGET_PROMOTE_FUNCTION_MODE unconditionally promotes, + return values, promote the mode here too. */ + mode = promote_function_mode (ret_type, mode, &unsigned_p, fn_decl_or_type, 1); - return riscv_get_arg_info (&info, &args, mode, type, true, true); + return riscv_get_arg_info (&info, &args, mode, ret_type, true, true); +} + +/* Implements hook TARGET_LIBCALL_VALUE. */ + +rtx +riscv_libcall_value (machine_mode mode, const_rtx fun ATTRIBUTE_UNUSED) +{ + struct riscv_arg_info info; + CUMULATIVE_ARGS args; + memset (&args, 0, sizeof args); + return riscv_get_arg_info (&info, &args, mode, NULL_TREE, true, true); } /* Implement TARGET_PASS_BY_REFERENCE. */ @@ -14037,10 +13965,13 @@ riscv_c_mode_for_floating_type (enum tree_index ti) return default_mode_for_floating_type (ti); } -/* This parses the attribute arguments to target_version in DECL and modifies - the feature mask and priority required to select those targets. */ +/* Parse the attribute arguments to target_version in DECL and modify + the feature mask and priority required to select those targets. + If LOC is nonnull, report diagnostics against *LOC, otherwise + remain silent. */ static void parse_features_for_version (tree decl, + location_t *loc, struct riscv_feature_bits &res, int &priority) { @@ -14071,14 +14002,12 @@ parse_features_for_version (tree decl, cl_target_option_restore (&global_options, &global_options_set, default_opts); - riscv_process_target_version_attr (TREE_VALUE (version_attr), - DECL_SOURCE_LOCATION (decl)); + riscv_process_target_version_attr (TREE_VALUE (version_attr), loc); priority = global_options.x_riscv_fmv_priority; const char *arch_string = global_options.x_riscv_arch_string; bool parse_res - = riscv_minimal_hwprobe_feature_bits (arch_string, &res, - DECL_SOURCE_LOCATION (decl)); + = riscv_minimal_hwprobe_feature_bits (arch_string, &res, loc); gcc_assert (parse_res); cl_target_option_restore (&global_options, &global_options_set, @@ -14135,8 +14064,8 @@ riscv_compare_version_priority (tree decl1, tree decl2) struct riscv_feature_bits mask1, mask2; int prio1, prio2; - parse_features_for_version (decl1, mask1, prio1); - parse_features_for_version (decl2, mask2, prio2); + parse_features_for_version (decl1, nullptr, mask1, prio1); + parse_features_for_version (decl2, nullptr, mask2, prio2); return compare_fmv_features (mask1, mask2, prio1, prio2); } @@ -14439,6 +14368,7 @@ dispatch_function_versions (tree dispatch_decl, version_info.version_decl = version_decl; // Get attribute string, parse it and find the right features. parse_features_for_version (version_decl, + &DECL_SOURCE_LOCATION (version_decl), version_info.features, version_info.prio); function_versions.push_back (version_info); @@ -15441,6 +15371,217 @@ synthesize_and (rtx operands[3]) return true; } +/* Synthesize OPERANDS[0] = OPERANDS[1] + OPERANDS[2]. + + OPERANDS[0] and OPERANDS[1] will be a REG and may be the same + REG. + + OPERANDS[2] is a CONST_INT. 
+ + Return TRUE if the operation was fully synthesized and the caller + need not generate additional code. Return FALSE if the operation + was not synthesized and the caller is responsible for emitting the + proper sequence. */ + +bool +synthesize_add (rtx operands[3]) +{ + /* Trivial cases that don't need synthesis. */ + if (SMALL_OPERAND (INTVAL (operands[2]))) + return false; + + int budget1 = riscv_const_insns (operands[2], true); + int budget2 = riscv_const_insns (GEN_INT (-INTVAL (operands[2])), true); + + HOST_WIDE_INT ival = INTVAL (operands[2]); + + /* If we can emit two addi insns then that's better than synthesizing + the constant into a temporary, then adding the temporary to the + other input. The exception is when the constant can be loaded + in a single instruction which can issue whenever it's convenient. */ + if (SUM_OF_TWO_S12 (ival) && budget1 >= 2) + { + HOST_WIDE_INT saturated = HOST_WIDE_INT_M1U << (IMM_BITS - 1); + + if (ival >= 0) + saturated = ~saturated; + + ival -= saturated; + + rtx x = gen_rtx_PLUS (word_mode, operands[1], GEN_INT (saturated)); + emit_insn (gen_rtx_SET (operands[0], x)); + rtx output = gen_rtx_PLUS (word_mode, operands[0], GEN_INT (ival)); + emit_insn (gen_rtx_SET (operands[0], output)); + return true; + } + + /* If we can shift the constant by 1, 2, or 3 bit positions + and the result is a cheaper constant, then do so. */ + ival = INTVAL (operands[2]); + if (TARGET_ZBA + && (((ival % 2) == 0 && budget1 + > riscv_const_insns (GEN_INT (ival >> 1), true)) + || ((ival % 4) == 0 && budget1 + > riscv_const_insns (GEN_INT (ival >> 2), true)) + || ((ival % 8) == 0 && budget1 + > riscv_const_insns (GEN_INT (ival >> 3), true)))) + { + // Load the shifted constant into a temporary + int shct = ctz_hwi (ival); + + /* We can handle shifting up to 3 bit positions via shNadd. */ + if (shct > 3) + shct = 3; + + /* The adjusted constant may still need synthesis, so do not copy + it directly into a register. Let the expander handle it. */ + rtx tmp = force_reg (word_mode, GEN_INT (ival >> shct)); + + /* Generate shift-add of temporary and operands[1] + into the final destination. */ + rtx x = gen_rtx_ASHIFT (word_mode, tmp, GEN_INT (shct)); + rtx output = gen_rtx_PLUS (word_mode, x, operands[1]); + emit_insn (gen_rtx_SET (operands[0], output)); + return true; + } + + /* If the negated constant is cheaper than the original, then negate + the constant and use sub. */ + if (budget2 < budget1) + { + // load -INTVAL (operands[2]) into a temporary + rtx tmp = force_reg (word_mode, GEN_INT (-INTVAL (operands[2]))); + + // subtract operands[2] from operands[1] + rtx output = gen_rtx_MINUS (word_mode, operands[1], tmp); + emit_insn (gen_rtx_SET (operands[0], output)); + return true; + } + + /* No add synthesis was found. Synthesize the constant into + a temporary and use that. */ + rtx x = force_reg (word_mode, operands[2]); + x = gen_rtx_PLUS (word_mode, operands[1], x); + emit_insn (gen_rtx_SET (operands[0], x)); + return true; +} +
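(Illustrative aside, not part of the patch: as a worked example of the SUM_OF_TWO_S12 path in synthesize_add above, take ival = 2500, which takes two instructions (lui plus addi) to materialize, so budget1 >= 2 holds. saturated becomes 2047, the largest simm12, and ival is reduced to 453, so the expander emits the equivalent of addi rd,rs,2047 followed by addi rd,rd,453 instead of loading 2500 into a temporary register first.)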
*/ + + +bool +synthesize_add_extended (rtx operands[3]) +{ + +/* If operands[2] is a 12-bit signed immediate, + no synthesis needs to be done. */ + + if (SMALL_OPERAND (INTVAL (operands[2]))) + return false; + + HOST_WIDE_INT ival = INTVAL (operands[2]); + int budget1 = riscv_const_insns (operands[2], true); + int budget2 = riscv_const_insns (GEN_INT (-INTVAL (operands[2])), true); + +/* If operands[2] can be split into two 12-bit signed immediates, + split add into two adds. */ + + if (SUM_OF_TWO_S12 (ival)) + { + HOST_WIDE_INT saturated = HOST_WIDE_INT_M1U << (IMM_BITS - 1); + + if (ival >= 0) + saturated = ~saturated; + + ival -= saturated; + + rtx temp = gen_reg_rtx (DImode); + emit_insn (gen_addsi3_extended (temp, operands[1], GEN_INT (saturated))); + temp = gen_lowpart (SImode, temp); + SUBREG_PROMOTED_VAR_P (temp) = 1; + SUBREG_PROMOTED_SET (temp, SRP_SIGNED); + emit_insn (gen_rtx_SET (operands[0], temp)); + rtx t = gen_reg_rtx (DImode); + emit_insn (gen_addsi3_extended (t, operands[0], GEN_INT (ival))); + t = gen_lowpart (SImode, t); + SUBREG_PROMOTED_VAR_P (t) = 1; + SUBREG_PROMOTED_SET (t, SRP_SIGNED); + emit_move_insn (operands[0], t); + return true; + } + + +/* If the negated value is cheaper to synthesize, subtract that from + operands[1]. */ + + if (budget2 < budget1) + { + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_rtx_SET (tmp, GEN_INT (-INTVAL (operands[2])))); + + rtx t = gen_reg_rtx (DImode); + emit_insn (gen_subsi3_extended (t, operands[1], tmp)); + t = gen_lowpart (SImode, t); + SUBREG_PROMOTED_VAR_P (t) = 1; + SUBREG_PROMOTED_SET (t, SRP_SIGNED); + emit_move_insn (operands[0], t); + return true; + } + + rtx tsrc = force_reg (SImode, operands[2]); + rtx tdest = gen_reg_rtx (DImode); + emit_insn (gen_addsi3_extended (tdest, operands[1], tsrc)); + tdest = gen_lowpart (SImode, tdest); + SUBREG_PROMOTED_VAR_P (tdest) = 1; + SUBREG_PROMOTED_SET (tdest, SRP_SIGNED); + emit_move_insn (operands[0], tdest); + return true; + +} + + +/* + HINT : argument specify the target cache + + TODO : LOCALITY is unused. + + Return the first operand of the associated PREF or PREFX insn. */ +rtx +riscv_prefetch_cookie (rtx hint, rtx locality) +{ + return (GEN_INT (INTVAL (hint) + + CacheHint::DCACHE_HINT + INTVAL (locality) * 0)); +} + +/* Return true if X is a legitimate address with offset for prefetch. + MODE is the mode of the value being accessed. */ +bool +riscv_prefetch_offset_address_p (rtx x, machine_mode mode) +{ + struct riscv_address_info addr; + + if (riscv_classify_address (&addr, x, mode, false) + && addr.type == ADDRESS_REG) + { + if (TARGET_XMIPSCBOP) + return (CONST_INT_P (addr.offset) + && MIPS_RISCV_9BIT_OFFSET_P (INTVAL (addr.offset))); + } + + return true; +} /* Initialize the GCC target structure. 
*/ #undef TARGET_ASM_ALIGNED_HI_OP @@ -15804,6 +15945,12 @@ synthesize_and (rtx operands[3]) #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P riscv_vector_mode_supported_any_target_p +#undef TARGET_FUNCTION_VALUE +#define TARGET_FUNCTION_VALUE riscv_function_value + +#undef TARGET_LIBCALL_VALUE +#define TARGET_LIBCALL_VALUE riscv_libcall_value + #undef TARGET_FUNCTION_VALUE_REGNO_P #define TARGET_FUNCTION_VALUE_REGNO_P riscv_function_value_regno_p diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 29342d8..9146571 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -765,12 +765,6 @@ enum reg_class #define CALLEE_SAVED_FREG_NUMBER(REGNO) CALLEE_SAVED_REG_NUMBER (REGNO - 32) -#define LIBCALL_VALUE(MODE) \ - riscv_function_value (NULL_TREE, NULL_TREE, MODE) - -#define FUNCTION_VALUE(VALTYPE, FUNC) \ - riscv_function_value (VALTYPE, FUNC, VOIDmode) - /* 1 if N is a possible register number for function argument passing. We have no FP argument registers when soft-float. */ @@ -1325,4 +1319,15 @@ extern void riscv_remove_unneeded_save_restore_calls (void); #define TARGET_HAS_FMV_TARGET_ATTRIBUTE 0 +/* mips pref valid offset range. */ +#define MIPS_RISCV_9BIT_OFFSET_P(OFFSET) (IN_RANGE (OFFSET, 0, 511)) + +/* mips pref cache hint type. */ +typedef enum { + ICACHE_HINT = 0 << 3, + DCACHE_HINT = 1 << 3, + SCACHE_HINT = 2 << 3, + TCACHE_HINT = 3 << 3 +} CacheHint; + #endif /* ! GCC_RISCV_H */ diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 578dd43..d34405c 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -712,24 +712,45 @@ (set_attr "mode" "SI")]) (define_expand "addsi3" - [(set (match_operand:SI 0 "register_operand" "=r,r") - (plus:SI (match_operand:SI 1 "register_operand" " r,r") - (match_operand:SI 2 "arith_operand" " r,I")))] + [(set (match_operand:SI 0 "register_operand") + (plus:SI (match_operand:SI 1 "register_operand") + (match_operand:SI 2 "reg_or_const_int_operand")))] "" { + /* We may be able to find a faster sequence, if so, then we are + done. Otherwise let expansion continue normally. */ + if (CONST_INT_P (operands[2]) + && ((!TARGET_64BIT && synthesize_add (operands)) + || (TARGET_64BIT && synthesize_add_extended (operands)))) + DONE; + + /* Constants have already been handled already. */ if (TARGET_64BIT) { - rtx t = gen_reg_rtx (DImode); - emit_insn (gen_addsi3_extended (t, operands[1], operands[2])); - t = gen_lowpart (SImode, t); - SUBREG_PROMOTED_VAR_P (t) = 1; - SUBREG_PROMOTED_SET (t, SRP_SIGNED); - emit_move_insn (operands[0], t); + rtx tdest = gen_reg_rtx (DImode); + emit_insn (gen_addsi3_extended (tdest, operands[1], operands[2])); + tdest = gen_lowpart (SImode, tdest); + SUBREG_PROMOTED_VAR_P (tdest) = 1; + SUBREG_PROMOTED_SET (tdest, SRP_SIGNED); + emit_move_insn (operands[0], tdest); DONE; } + }) -(define_insn "adddi3" +(define_expand "adddi3" + [(set (match_operand:DI 0 "register_operand") + (plus:DI (match_operand:DI 1 "register_operand") + (match_operand:DI 2 "reg_or_const_int_operand")))] + "TARGET_64BIT" +{ + /* We may be able to find a faster sequence, if so, then we are + done. Otherwise let expansion continue normally. 
 */
+  if (CONST_INT_P (operands[2]) && synthesize_add (operands))
+    DONE;
+})
+
+(define_insn "*adddi3"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
	(plus:DI (match_operand:DI 1 "register_operand" " r,r")
		 (match_operand:DI 2 "arith_operand" " r,I")))]
@@ -2293,12 +2314,16 @@
   rtx abs_reg = gen_reg_rtx (<ANYF:MODE>mode);
   rtx coeff_reg = gen_reg_rtx (<ANYF:MODE>mode);
   rtx tmp_reg = gen_reg_rtx (<ANYF:MODE>mode);
+  rtx fflags = gen_reg_rtx (SImode);

   riscv_emit_move (tmp_reg, operands[1]);
   riscv_emit_move (coeff_reg,
		    riscv_vector::get_fp_rounding_coefficient (<ANYF:MODE>mode));
   emit_insn (gen_abs<ANYF:mode>2 (abs_reg, operands[1]));

+  /* The FP compare can set the invalid flag for NaN, so back up fflags.  */
+  if (flag_trapping_math)
+    emit_insn (gen_riscv_frflags (fflags));
   riscv_expand_conditional_branch (label, LT, abs_reg, coeff_reg);
   emit_jump_insn (gen_jump (end_label));
@@ -2324,6 +2349,14 @@
   emit_insn (gen_copysign<ANYF:mode>3 (tmp_reg, abs_reg, operands[1]));

   emit_label (end_label);
+
+  /* Restore fflags, but after the label.  This is slightly different
+     from the glibc implementation, which only needs to restore under
+     the label: it checks for NaN first, so the following FP compare
+     can't raise FP exceptions and thus can't clobber fflags.  */
+  if (flag_trapping_math)
+    emit_insn (gen_riscv_fsflags (fflags));
+
   riscv_emit_move (operands[0], tmp_reg);
 }
@@ -4402,11 +4435,21 @@
 )

 (define_insn "prefetch"
-  [(prefetch (match_operand 0 "prefetch_operand" "Qr")
-	     (match_operand 1 "imm5_operand" "i")
-	     (match_operand 2 "const_int_operand" "n"))]
-  "TARGET_ZICBOP"
+  [(prefetch (match_operand 0 "prefetch_operand" "Qr,ZD")
+	     (match_operand 1 "imm5_operand" "i,i")
+	     (match_operand 2 "const_int_operand" "n,n"))]
+  "TARGET_ZICBOP || TARGET_XMIPSCBOP"
 {
+  if (TARGET_XMIPSCBOP)
+    {
+      /* The MIPS prefetch-for-write is a nop on the p8700.  */
+      if (operands[1] != CONST0_RTX (GET_MODE (operands[1])))
+	return "nop";
+
+      operands[1] = riscv_prefetch_cookie (operands[1], operands[2]);
+      return "mips.pref\t%1,%a0";
+    }
+
   switch (INTVAL (operands[1]))
     {
     case 0:
diff --git a/gcc/config/riscv/sifive-p400.md b/gcc/config/riscv/sifive-p400.md
index ed8b8ec..0acdbda 100644
--- a/gcc/config/riscv/sifive-p400.md
+++ b/gcc/config/riscv/sifive-p400.md
@@ -153,10 +153,13 @@
     (eq_attr "type" "fmove,fcvt"))
  "p400_float_pipe,sifive_p400_fpu")

+;; We need something for HF so that we don't abort during scheduling
+;; if someone were to ask for p400 scheduling while also enabling the
+;; various HF mode extensions.
 (define_insn_reservation "sifive_p400_fdiv_s" 18
   (and (eq_attr "tune" "sifive_p400")
        (eq_attr "type" "fdiv,fsqrt")
-       (eq_attr "mode" "SF"))
+       (eq_attr "mode" "HF,SF"))
  "sifive_p400_FM, sifive_p400_fdiv*5")

 (define_insn_reservation "sifive_p400_fdiv_d" 31
@@ -178,3 +181,18 @@
 (define_bypass 1 "sifive_p400_f2i"
  "sifive_p400_branch,sifive_p400_sfb_alu,sifive_p400_mul,
   sifive_p400_div,sifive_p400_alu,sifive_p400_cpop")
+
+
+;; Someone familiar with the p400 uarch needs to put
+;; these into the right reservations.  This is just a placeholder
+;; for everything I found that had no mapping to a reservation.
+;;
+;; Note that even if the processor does not implement a particular
+;; instruction, it should still have suitable reservations, even if
+;; they are just dummies like this one.
+(define_insn_reservation "sifive_p400_unknown" 1 + (and (eq_attr "tune" "sifive_p400") + (eq_attr "type" "ghost,vfrecp,vclmul,vldm,vmffs,vclmulh,vlsegde,vfcvtitof,vsm4k,vfcvtftoi,vfdiv,vsm3c,vsm4r,viwmuladd,vfwredu,vcpop,vfwmuladd,vstux,vsshift,vfwcvtftof,vfncvtftof,vfwmaccbf16,vext,vssegte,rdvl,vaeskf1,vfslide1up,vmov,vimovvx,vaesef,vfsqrt,viminmax,vfwcvtftoi,vssegtox,vfclass,viwmul,vector,vgmul,vsm3me,vfcmp,vstm,vfredo,vfwmul,vaeskf2,vstox,vfncvtbf16,vislide1up,vgather,vldox,viwred,vctz,vghsh,vsts,vslidedown,vfmerge,vicmp,vsmul,vlsegdff,vfalu,vfmov,vislide1down,vfminmax,vcompress,vldr,vldff,vlsegdux,vimuladd,vsalu,vidiv,sf_vqmacc,vfslide1down,vaesem,vimerge,vfncvtftoi,vfwcvtitof,vicalu,vaesz,sf_vc_se,vsha2cl,vmsfs,vldux,vmidx,vslideup,vired,vlde,vfwredo,vfmovfv,vbrev,vfncvtitof,rdfrm,vsetvl,vssegts,vimul,vialu,vbrev8,vfwalu,rdvlenb,sf_vfnrclip,vclz,vnclip,sf_vc,vimov,vste,vfmuladd,vfmovvf,vwsll,vsetvl_pre,vlds,vlsegds,vmiota,vmalu,wrvxrm,wrfrm,viwalu,vaesdm,vssegtux,vaesdf,vimovxv,vror,vnshift,vstr,vaalu,vsha2ms,crypto,vfwcvtbf16,vlsegdox,vrol,vandn,vfsgnj,vmpop,vfredu,vsha2ch,vshift,vrev8,vfmul")) + "p400_int_pipe+sifive_p400_ialu") + + diff --git a/gcc/config/riscv/sifive-p600.md b/gcc/config/riscv/sifive-p600.md index 2401349..ccd006d 100644 --- a/gcc/config/riscv/sifive-p600.md +++ b/gcc/config/riscv/sifive-p600.md @@ -157,10 +157,13 @@ (eq_attr "type" "fmove,fcvt")) "float_pipe,sifive_p600_fpu") +;; We need something for HF so that we don't abort during +;; scheduling if someone was to ask for p600 scheduling, but +;; enable the various HF mode extensions. (define_insn_reservation "sifive_p600_fdiv_s" 11 (and (eq_attr "tune" "sifive_p600") (eq_attr "type" "fdiv,fsqrt") - (eq_attr "mode" "SF")) + (eq_attr "mode" "HF,SF")) "sifive_p600_FM, sifive_p600_fdiv*5") (define_insn_reservation "sifive_p600_fdiv_d" 19 @@ -182,3 +185,15 @@ (define_bypass 1 "sifive_p600_f2i" "sifive_p600_branch,sifive_p600_sfb_alu,sifive_p600_mul, sifive_p600_div,sifive_p600_alu,sifive_p600_cpop") + +;; Someone familiar with the p600 uarch needs to put +;; these into the right reservations. This is just a placeholder +;; for everything I found that had no mapping to a reservation. +;; +;; Note that even if the processor does not implementat a particular +;; instruction it should still have suitable reservations, even if +;; they are just dummies like this one. 
+(define_insn_reservation "sifive_p600_unknown" 1 + (and (eq_attr "tune" "sifive_p600") + (eq_attr "type" "vicmp,vssegte,vbrev8,vfwalu,vimov,vmpop,vaesdf,vislide1up,vror,vsha2cl,vrol,vslideup,vimuladd,vclmul,vaesef,vext,vlsegdff,vfmuladd,vfclass,vmsfs,vfcmp,vsmul,vsm3me,vmalu,vshift,viwmuladd,vfslide1up,vlsegde,vsm4k,wrvxrm,vislide1down,vsm3c,vfwmuladd,vaesdm,vclmulh,vfwcvtftof,vfwredu,vfredo,sf_vfnrclip,vaesz,vwsll,vmiota,vctz,vsetvl_pre,vstm,vidiv,vssegtux,vfwmul,vcompress,vste,vired,vlsegds,vaesem,vfminmax,ghost,vandn,crypto,vfmul,vialu,vfmovvf,rdfrm,vldff,vfmerge,vsshift,vnclip,sf_vqmacc,vnshift,vfdiv,vfslide1down,vfncvtitof,vfsqrt,vimovxv,vstr,vfwcvtbf16,vfwcvtitof,vbrev,vssegtox,vssegts,vcpop,vmffs,viwmul,vldr,vmidx,rdvlenb,vfalu,vslidedown,vlde,vfsgnj,vfmov,viwalu,vsha2ch,vfncvtbf16,vfcvtitof,rdvl,vsetvl,vsha2ms,vector,vstux,vimerge,vclz,sf_vc,vfcvtftoi,viminmax,vsm4r,sf_vc_se,wrfrm,vstox,vfmovfv,vfncvtftoi,vimul,vsalu,vmov,vgmul,vgather,vldux,vlsegdox,vfncvtftof,vimovvx,vghsh,vldm,vldox,vfwcvtftoi,vlds,vfrecp,vaeskf2,vsts,vfredu,vicalu,vaalu,vfwmaccbf16,vrev8,vfwredo,vlsegdux,viwred,vaeskf1")) + "int_pipe+sifive_p600_ialu") diff --git a/gcc/config/riscv/sync.md b/gcc/config/riscv/sync.md index 50ec8b3..ab6f430 100644 --- a/gcc/config/riscv/sync.md +++ b/gcc/config/riscv/sync.md @@ -376,7 +376,19 @@ (match_operand:SI 3 "const_int_operand")] ;; model "TARGET_ZAAMO || TARGET_ZALRSC" { - if (TARGET_ZAAMO) + if (TARGET_ZAAMO && TARGET_64BIT && <MODE>mode == SImode) + { + rtx t = gen_reg_rtx (DImode); + emit_insn (gen_amo_atomic_exchange_extended (t, + operands[1], + operands[2], + operands[3])); + t = gen_lowpart (SImode, t); + SUBREG_PROMOTED_VAR_P (t) = 1; + SUBREG_PROMOTED_SET (t, SRP_SIGNED); + emit_move_insn (operands[0], t); + } + else if (TARGET_ZAAMO) emit_insn (gen_amo_atomic_exchange<mode> (operands[0], operands[1], operands[2], operands[3])); else @@ -386,18 +398,31 @@ }) (define_insn "amo_atomic_exchange<mode>" - [(set (match_operand:GPR 0 "register_operand" "=&r") + [(set (match_operand:GPR 0 "register_operand" "=r") (unspec_volatile:GPR [(match_operand:GPR 1 "memory_operand" "+A") (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_EXCHANGE)) (set (match_dup 1) - (match_operand:GPR 2 "register_operand" "0"))] + (match_operand:GPR 2 "reg_or_0_operand" "rJ"))] "TARGET_ZAAMO" "amoswap.<amo>%A3\t%0,%z2,%1" [(set_attr "type" "atomic") (set (attr "length") (const_int 4))]) +(define_insn "amo_atomic_exchange_extended" + [(set (match_operand:DI 0 "register_operand" "=r") + (sign_extend:DI (unspec_volatile:SI + [(match_operand:SI 1 "memory_operand" "+A") + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_EXCHANGE))) + (set (match_dup 1) + (match_operand:SI 2 "reg_or_0_operand" "rJ"))] + "TARGET_64BIT && TARGET_ZAAMO" + "amoswap.w%A3\t%0,%z2,%1" + [(set_attr "type" "atomic") + (set (attr "length") (const_int 4))]) + (define_insn "lrsc_atomic_exchange<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") (unspec_volatile:GPR @@ -434,13 +459,13 @@ }) (define_insn "zabha_atomic_exchange<mode>" - [(set (match_operand:SHORT 0 "register_operand" "=&r") + [(set (match_operand:SHORT 0 "register_operand" "=r") (unspec_volatile:SHORT [(match_operand:SHORT 1 "memory_operand" "+A") (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_EXCHANGE_ZABHA)) (set (match_dup 1) - (match_operand:SHORT 2 "register_operand" "0"))] + (match_operand:SHORT 2 "reg_or_0_operand" "rJ"))] "TARGET_ZABHA" "amoswap.<amobh>%A3\t%0,%z2,%1" [(set_attr "type" "atomic") diff 
--git a/gcc/config/riscv/t-rtems b/gcc/config/riscv/t-rtems index f596e76..a4d2d03 100644 --- a/gcc/config/riscv/t-rtems +++ b/gcc/config/riscv/t-rtems @@ -1,8 +1,8 @@ MULTILIB_OPTIONS = MULTILIB_DIRNAMES = -MULTILIB_OPTIONS += march=rv32i/march=rv32iac/march=rv32im/march=rv32imf/march=rv32ima/march=rv32imac/march=rv32imaf/march=rv32imafc/march=rv32imafd/march=rv32imafdc/march=rv64ima/march=rv64imac/march=rv64imafd/march=rv64imafdc -MULTILIB_DIRNAMES += rv32i rv32iac rv32im rv32imf rv32ima rv32imac rv32imaf rv32imafc rv32imafd rv32imafdc rv64ima rv64imac rv64imafd rv64imafdc +MULTILIB_OPTIONS += march=rv32i/march=rv32iac/march=rv32im/march=rv32imf/march=rv32ima/march=rv32imac/march=rv32imaf/march=rv32imafc/march=rv32imafd/march=rv32imafdc/march=rv64ima/march=rv64imac/march=rv64imafd/march=rv64imafdc/march=rv64imc +MULTILIB_DIRNAMES += rv32i rv32iac rv32im rv32imf rv32ima rv32imac rv32imaf rv32imafc rv32imafd rv32imafdc rv64ima rv64imac rv64imafd rv64imafdc rv64imc MULTILIB_OPTIONS += mabi=ilp32/mabi=ilp32f/mabi=ilp32d/mabi=lp64/mabi=lp64d MULTILIB_DIRNAMES += ilp32 ilp32f ilp32d lp64 lp64d @@ -10,6 +10,9 @@ MULTILIB_DIRNAMES += ilp32 ilp32f ilp32d lp64 lp64d MULTILIB_OPTIONS += mcmodel=medany MULTILIB_DIRNAMES += medany +MULTILIB_OPTIONS += mstrict-align +MULTILIB_DIRNAMES += strict-align + MULTILIB_REQUIRED = MULTILIB_REQUIRED += march=rv32i/mabi=ilp32 MULTILIB_REQUIRED += march=rv32iac/mabi=ilp32 @@ -25,3 +28,5 @@ MULTILIB_REQUIRED += march=rv64ima/mabi=lp64/mcmodel=medany MULTILIB_REQUIRED += march=rv64imac/mabi=lp64/mcmodel=medany MULTILIB_REQUIRED += march=rv64imafd/mabi=lp64d/mcmodel=medany MULTILIB_REQUIRED += march=rv64imafdc/mabi=lp64d/mcmodel=medany +MULTILIB_REQUIRED += march=rv64imafdc/mabi=lp64d/mcmodel=medany/mstrict-align +MULTILIB_REQUIRED += march=rv64imc/mabi=lp64/mcmodel=medany/mstrict-align diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 66b7670..2b35d66 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1398,6 +1398,7 @@ } [(set_attr "type" "vmov,vlde,vste") (set_attr "mode" "<VT:MODE>") + (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))]) @@ -1435,6 +1436,7 @@ } [(set_attr "type" "vlde,vste,vmov") (set_attr "mode" "<MODE>") + (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))] ) @@ -1485,6 +1487,7 @@ } [(set_attr "type" "vlde,vste,vmov") (set_attr "mode" "<VLS_AVL_REG:MODE>") + (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))] ) @@ -5490,6 +5493,98 @@ "TARGET_VECTOR" {}) +(define_expand "@pred_mul_plus_vx_<mode>" + [(set (match_operand:V_VLSI_QHS 0 "register_operand") + (if_then_else:V_VLSI_QHS + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 6 "vector_length_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (plus:V_VLSI_QHS + (mult:V_VLSI_QHS + (vec_duplicate:V_VLSI_QHS + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSI_QHS 3 "register_operand")) + (match_operand:V_VLSI_QHS 4 "register_operand")) + (match_operand:V_VLSI_QHS 5 "vector_merge_operand")))] + 
"TARGET_VECTOR" +{ + riscv_vector::prepare_ternary_operands (operands); +}) + +(define_expand "@pred_mul_plus_vx_<mode>" + [(set (match_operand:V_VLSI_D 0 "register_operand") + (if_then_else:V_VLSI_D + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 6 "vector_length_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (plus:V_VLSI_D + (mult:V_VLSI_D + (vec_duplicate:V_VLSI_D + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSI_D 3 "register_operand")) + (match_operand:V_VLSI_D 4 "register_operand")) + (match_operand:V_VLSI_D 5 "vector_merge_operand")))] + "TARGET_VECTOR && TARGET_64BIT" +{ + riscv_vector::prepare_ternary_operands (operands); +}) + +(define_expand "@pred_vnmsac_vx_<mode>" + [(set (match_operand:V_VLSI_QHS 0 "register_operand") + (if_then_else:V_VLSI_QHS + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 6 "vector_length_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (minus:V_VLSI_QHS + (match_operand:V_VLSI_QHS 4 "register_operand") + (mult:V_VLSI_QHS + (vec_duplicate:V_VLSI_QHS + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSI_QHS 3 "register_operand"))) + (match_operand:V_VLSI_QHS 5 "vector_merge_operand")))] + "TARGET_VECTOR" +{ + riscv_vector::prepare_ternary_operands (operands); +}) + +(define_expand "@pred_vnmsac_vx_<mode>" + [(set (match_operand:V_VLSI_D 0 "register_operand") + (if_then_else:V_VLSI_D + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 6 "vector_length_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (minus:V_VLSI_D + (match_operand:V_VLSI_D 4 "register_operand") + (mult:V_VLSI_D + (vec_duplicate:V_VLSI_D + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSI_D 3 "register_operand"))) + (match_operand:V_VLSI_D 5 "vector_merge_operand")))] + "TARGET_VECTOR && TARGET_64BIT" +{ + riscv_vector::prepare_ternary_operands (operands); +}) + (define_insn "*pred_madd<mode>_scalar" [(set (match_operand:V_VLSI 0 "register_operand" "=vd, vr") (if_then_else:V_VLSI @@ -6324,8 +6419,8 @@ (set_attr "mode" "<MODE>")]) (define_insn "@pred_<optab><mode>_scalar" - [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr") - (if_then_else:VF + [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSF (unspec:<VM> [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") @@ -6336,11 +6431,11 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM) (reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE) - (commutative_float_binop:VF - (vec_duplicate:VF + (commutative_float_binop:V_VLSF + (vec_duplicate:V_VLSF (match_operand:<VEL> 4 "register_operand" " f, f, f, f")) - (match_operand:VF 3 "register_operand" " vr, vr, vr, vr")) - (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr")) + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "vf<insn>.vf\t%0,%3,%4%p1" [(set_attr "type" "<float_insn_type>") @@ -6349,43 +6444,43 @@ 
(symbol_ref "riscv_vector::get_frm_mode (operands[9])"))]) (define_insn "@pred_<optab><mode>_scalar" - [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr") - (if_then_else:VF + [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSF (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") - (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") - (match_operand 6 "const_int_operand" " i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i") + [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") + (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") + (match_operand 8 "const_int_operand" " i, i, i, i") (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (commutative_float_binop_nofrm:VF - (vec_duplicate:VF - (match_operand:<VEL> 4 "register_operand" " f, f, f, f")) - (match_operand:VF 3 "register_operand" " vr, vr, vr, vr")) - (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (commutative_float_binop_nofrm:V_VLSF + (vec_duplicate:V_VLSF + (match_operand:<VEL> 4 "register_operand" " f, f, f, f")) + (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr")) + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "vf<insn>.vf\t%0,%3,%4%p1" [(set_attr "type" "<float_insn_type>") (set_attr "mode" "<MODE>")]) (define_insn "@pred_<ieee_fmaxmin_op><mode>_scalar" - [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr") - (if_then_else:VF + [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSF (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") - (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") - (match_operand 6 "const_int_operand" " i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i") + [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") + (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") + (match_operand 8 "const_int_operand" " i, i, i, i") (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (unspec:VF - [(match_operand:VF 3 "register_operand" " vr, vr, vr, vr") - (vec_duplicate:VF + (unspec:V_VLSF + [(match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr") + (vec_duplicate:V_VLSF (match_operand:<VEL> 4 "register_operand" " f, f, f, f"))] UNSPEC_VFMAXMIN) - (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "v<ieee_fmaxmin_op>.vf\t%0,%3,%4%p1" [(set_attr "type" "vfminmax") @@ -6417,8 +6512,8 @@ (symbol_ref "riscv_vector::get_frm_mode (operands[9])"))]) (define_insn "@pred_<optab><mode>_reverse_scalar" - [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr") - (if_then_else:VF + [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSF (unspec:<VM> [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") @@ -6429,11 +6524,11 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM) (reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE) - (non_commutative_float_binop:VF - (vec_duplicate:VF + 
(non_commutative_float_binop:V_VLSF + (vec_duplicate:V_VLSF (match_operand:<VEL> 4 "register_operand" " f, f, f, f")) - (match_operand:VF 3 "register_operand" " vr, vr, vr, vr")) - (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr")) + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "vfr<insn>.vf\t%0,%3,%4%p1" [(set_attr "type" "<float_insn_type>") @@ -8839,6 +8934,106 @@ [(set_attr "type" "vssegt<order>x") (set_attr "mode" "<V32T:MODE>")]) +(define_insn "*pred_macc_<mode>_scalar_undef" + [(set (match_operand:V_VLSI_QHS 0 "register_operand" "=vd, vr") + (if_then_else:V_VLSI_QHS + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1") + (match_operand 6 "vector_length_operand" "rvl, rvl") + (match_operand 7 "const_int_operand" " i, i") + (match_operand 8 "const_int_operand" " i, i") + (match_operand 9 "const_int_operand" " i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (plus:V_VLSI_QHS + (mult:V_VLSI_QHS + (vec_duplicate:V_VLSI_QHS + (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ")) + (match_operand:V_VLSI_QHS 4 "register_operand" " vr, vr")) + (match_operand:V_VLSI_QHS 5 "register_operand" " 0, 0")) + (match_operand:V_VLSI_QHS 2 "vector_undef_operand")))] + "TARGET_VECTOR" + "@ + vmacc.vx\t%0,%z3,%4%p1 + vmacc.vx\t%0,%z3,%4%p1" + [(set_attr "type" "vimuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*pred_macc_<mode>_scalar_undef" + [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr") + (if_then_else:V_VLSI_D + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1") + (match_operand 6 "vector_length_operand" "rvl, rvl") + (match_operand 7 "const_int_operand" " i, i") + (match_operand 8 "const_int_operand" " i, i") + (match_operand 9 "const_int_operand" " i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (plus:V_VLSI_D + (mult:V_VLSI_D + (vec_duplicate:V_VLSI_D + (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ")) + (match_operand:V_VLSI_D 4 "register_operand" " vr, vr")) + (match_operand:V_VLSI_D 5 "register_operand" " 0, 0")) + (match_operand:V_VLSI_D 2 "vector_undef_operand")))] + "TARGET_VECTOR && TARGET_64BIT" + "@ + vmacc.vx\t%0,%z3,%4%p1 + vmacc.vx\t%0,%z3,%4%p1" + [(set_attr "type" "vimuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*pred_nmsac_<mode>_scalar_undef" + [(set (match_operand:V_VLSI_QHS 0 "register_operand" "=vd, vr") + (if_then_else:V_VLSI_QHS + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1") + (match_operand 6 "vector_length_operand" "rvl, rvl") + (match_operand 7 "const_int_operand" " i, i") + (match_operand 8 "const_int_operand" " i, i") + (match_operand 9 "const_int_operand" " i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (minus:V_VLSI_QHS + (match_operand:V_VLSI_QHS 5 "register_operand" " 0, 0") + (mult:V_VLSI_QHS + (vec_duplicate:V_VLSI_QHS + (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ")) + (match_operand:V_VLSI_QHS 4 "register_operand" " vr, vr"))) + (match_operand:V_VLSI_QHS 2 "vector_undef_operand")))] + "TARGET_VECTOR" + "@ + vnmsac.vx\t%0,%z3,%4%p1 + vnmsac.vx\t%0,%z3,%4%p1" + [(set_attr "type" "vimuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*pred_nmsac_<mode>_scalar_undef" + [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr") + (if_then_else:V_VLSI_D + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1") + (match_operand 6 
"vector_length_operand" "rvl, rvl") + (match_operand 7 "const_int_operand" " i, i") + (match_operand 8 "const_int_operand" " i, i") + (match_operand 9 "const_int_operand" " i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (minus:V_VLSI_D + (match_operand:V_VLSI_D 5 "register_operand" " 0, 0") + (mult:V_VLSI_D + (vec_duplicate:V_VLSI_D + (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ")) + (match_operand:V_VLSI_D 4 "register_operand" " vr, vr"))) + (match_operand:V_VLSI_D 2 "vector_undef_operand")))] + "TARGET_VECTOR && TARGET_64BIT" + "@ + vnmsac.vx\t%0,%z3,%4%p1 + vnmsac.vx\t%0,%z3,%4%p1" + [(set_attr "type" "vimuladd") + (set_attr "mode" "<MODE>")]) + (include "autovec.md") (include "autovec-opt.md") (include "sifive-vector.md") diff --git a/gcc/config/riscv/xiangshan.md b/gcc/config/riscv/xiangshan.md index 34b4a8f..6179140 100644 --- a/gcc/config/riscv/xiangshan.md +++ b/gcc/config/riscv/xiangshan.md @@ -144,13 +144,13 @@ (define_insn_reservation "xiangshan_sfdiv" 11 (and (eq_attr "tune" "xiangshan") (eq_attr "type" "fdiv") - (eq_attr "mode" "SF")) + (eq_attr "mode" "HF,SF")) "xs_fmisc_rs") (define_insn_reservation "xiangshan_sfsqrt" 17 (and (eq_attr "tune" "xiangshan") (eq_attr "type" "fsqrt") - (eq_attr "mode" "SF")) + (eq_attr "mode" "HF,SF")) "xs_fmisc_rs") (define_insn_reservation "xiangshan_dfdiv" 21 diff --git a/gcc/config/rl78/rl78.opt.urls b/gcc/config/rl78/rl78.opt.urls index 96eff5f..66e874b 100644 --- a/gcc/config/rl78/rl78.opt.urls +++ b/gcc/config/rl78/rl78.opt.urls @@ -4,7 +4,7 @@ msim UrlSuffix(gcc/RL78-Options.html#index-msim-6) mmul= -UrlSuffix(gcc/RL78-Options.html#index-mmul) +UrlSuffix(gcc/RL78-Options.html#index-mmul-1) mallregs UrlSuffix(gcc/RL78-Options.html#index-mallregs) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 764b499..8dd23f8 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -10322,7 +10322,7 @@ can_be_rotated_to_negative_lis (HOST_WIDE_INT c, int *rot) rotated over the highest bit. 
*/ unsigned HOST_WIDE_INT uc = c; int pos_one = clz_hwi ((HOST_WIDE_INT) (uc << 16) >> 16); - if (pos_one != 0) + if (pos_one > 0 && pos_one < HOST_BITS_PER_WIDE_INT) { middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one)); int middle_ones = clz_hwi (~(uc << pos_one)); @@ -10585,7 +10585,7 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c, int *num_insns) { /* li/lis; rldicX */ unsigned HOST_WIDE_INT imm = (c | ~mask); - if (shift != 0) + if (shift > 0 && shift < HOST_BITS_PER_WIDE_INT) imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift)); count_or_emit_insn (temp, GEN_INT (imm)); diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index e31ee40..04a6c0f 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -15665,10 +15665,10 @@ (if_then_else:SI (lt (match_dup 3) (const_int 0)) (const_int -1) - (if_then_else (gt (match_dup 3) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gt (match_dup 3) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC" { operands[3] = gen_reg_rtx (CCmode); @@ -15703,10 +15703,10 @@ (if_then_else:SI (lt (match_operand:CC 1 "cc_reg_operand" "y") (const_int 0)) (const_int -1) - (if_then_else (gt (match_dup 1) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gt (match_dup 1) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC" "setb %0,%1" [(set_attr "type" "logical")]) @@ -15716,10 +15716,10 @@ (if_then_else:SI (ltu (match_operand:CCUNS 1 "cc_reg_operand" "y") (const_int 0)) (const_int -1) - (if_then_else (gtu (match_dup 1) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gtu (match_dup 1) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC" "setb %0,%1" [(set_attr "type" "logical")]) @@ -15751,10 +15751,10 @@ (if_then_else:SI (lt (match_dup 3) (const_int 0)) (const_int -1) - (if_then_else (gt (match_dup 3) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gt (match_dup 3) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC" { operands[3] = gen_reg_rtx (CCmode); @@ -15807,10 +15807,10 @@ (if_then_else:SI (lt (match_dup 3) (const_int 0)) (const_int -1) - (if_then_else (gt (match_dup 3) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gt (match_dup 3) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC && TARGET_64BIT" { operands[3] = gen_reg_rtx (CCmode); diff --git a/gcc/config/rx/rx.cc b/gcc/config/rx/rx.cc index dd730dc..c563881 100644 --- a/gcc/config/rx/rx.cc +++ b/gcc/config/rx/rx.cc @@ -1648,16 +1648,20 @@ mark_frame_related (rtx insn) static void add_pop_cfi_notes (rtx_insn *insn, unsigned int high, unsigned int low) { - rtx t = plus_constant (Pmode, stack_pointer_rtx, - (high - low + 1) * UNITS_PER_WORD); + rtx src = stack_pointer_rtx; + rtx t; + for (unsigned int i = low; i <= high; i++) + { + add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (word_mode, i)); + if (i == FRAME_POINTER_REGNUM && frame_pointer_needed) + src = frame_pointer_rtx; + } + t = plus_constant (Pmode, src, (high - low + 1) * UNITS_PER_WORD); t = gen_rtx_SET (stack_pointer_rtx, t); add_reg_note (insn, REG_CFA_ADJUST_CFA, t); RTX_FRAME_RELATED_P (insn) = 1; - for (unsigned int i = low; i <= high; i++) - add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (word_mode, i)); } - static bool ok_for_max_constant (HOST_WIDE_INT val) { @@ -1816,36 +1820,17 @@ rx_expand_prologue (void) } } - /* If needed, set up the frame pointer. 
*/ - if (frame_pointer_needed) - gen_safe_add (frame_pointer_rtx, stack_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) frame_size), true); - - /* Allocate space for the outgoing args. - If the stack frame has not already been set up then handle this as well. */ - if (stack_size) + if (stack_size || frame_size) { - if (frame_size) - { - if (frame_pointer_needed) - gen_safe_add (stack_pointer_rtx, frame_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) stack_size), true); - else - gen_safe_add (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) (frame_size + stack_size)), - true); - } - else - gen_safe_add (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) stack_size), true); + gen_safe_add (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (- (HOST_WIDE_INT) (stack_size + frame_size)), + true); } - else if (frame_size) + if (frame_pointer_needed) { - if (! frame_pointer_needed) - gen_safe_add (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) frame_size), true); - else - gen_safe_add (stack_pointer_rtx, frame_pointer_rtx, NULL_RTX, true); + gen_safe_add (frame_pointer_rtx, stack_pointer_rtx, + GEN_INT ((HOST_WIDE_INT) stack_size), + true); } } diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index d044f9a..1a47f47 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -8318,7 +8318,7 @@ s390_expand_int_spaceship (rtx op0, rtx op1, rtx op2, rtx op3) } /* Expand floating-point op0 = op1 <=> op2, i.e., - op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : 2. + op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : -128. If op3 equals const0_rtx, then we are interested in the compare only (see test spaceship-fp-4.c). Otherwise, op3 is a CONST_INT different than @@ -8368,7 +8368,7 @@ s390_expand_fp_spaceship (rtx op0, rtx op1, rtx op2, rtx op3) { emit_jump (l_end); emit_label (l_unordered); - rtx unord_val = op3 == const0_rtx ? const2_rtx : op3; + rtx unord_val = op3 == const0_rtx ? GEN_INT (-128) : op3; emit_move_insn (op0, unord_val); } emit_label (l_end); diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 8cc48b0..858387c 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -5248,18 +5248,19 @@ }) (define_insn "*zero_extendsidi2" - [(set (match_operand:DI 0 "register_operand" "=d,d,d") - (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "d,T,b")))] + [(set (match_operand:DI 0 "register_operand" "=d,d,d,d") + (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "d,T,b,v")))] "TARGET_ZARCH" "@ llgfr\t%0,%1 llgf\t%0,%1 - llgfrl\t%0,%1" - [(set_attr "op_type" "RRE,RXY,RIL") - (set_attr "type" "*,*,larl") - (set_attr "cpu_facility" "*,*,z10") - (set_attr "z10prop" "z10_fwd_E1,z10_fwd_A3,z10_fwd_A3") - (set_attr "relative_long" "*,*,yes")]) + llgfrl\t%0,%1 + vlgvf\t%0,%v1,0" + [(set_attr "op_type" "RRE,RXY,RIL,VRS") + (set_attr "type" "*,*,larl,*") + (set_attr "cpu_facility" "*,*,z10,vx") + (set_attr "z10prop" "z10_fwd_E1,z10_fwd_A3,z10_fwd_A3,*") + (set_attr "relative_long" "*,*,yes,*")]) ; ; LLGT-type instructions (zero-extend from 31 bit to 64 bit). 
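For reference, the s390.cc hunk above only changes the value produced for unordered operands in the floating-point spaceship expansion: the default moves from 2 to -128, and a caller that passes a nonzero op3 still gets its own value back. A minimal standalone sketch of the mapping described by the updated comment (the -128 is taken from that comment; nothing below is GCC code):

#include <cassert>
#include <cmath>

// Value mapping from the updated s390_expand_fp_spaceship comment:
// equal -> 0, less -> -1, greater -> 1, unordered -> -128.
static int fp_spaceship (double op1, double op2)
{
  if (op1 == op2)
    return 0;      // equal
  if (op1 < op2)
    return -1;     // less
  if (op1 > op2)
    return 1;      // greater
  return -128;     // unordered: at least one operand is a NaN
}

int main ()
{
  assert (fp_spaceship (1.0, 2.0) == -1);
  assert (fp_spaceship (2.0, 1.0) == 1);
  assert (fp_spaceship (1.0, 1.0) == 0);
  assert (fp_spaceship (NAN, 1.0) == -128);
  return 0;
}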
@@ -5362,29 +5363,32 @@ ; llhrl, llghrl (define_insn "*zero_extendhi<mode>2_z10" - [(set (match_operand:GPR 0 "register_operand" "=d,d,d") - (zero_extend:GPR (match_operand:HI 1 "nonimmediate_operand" "d,T,b")))] + [(set (match_operand:GPR 0 "register_operand" "=d,d,d,d") + (zero_extend:GPR (match_operand:HI 1 "nonimmediate_operand" "d,T,b,v")))] "TARGET_Z10" "@ ll<g>hr\t%0,%1 ll<g>h\t%0,%1 - ll<g>hrl\t%0,%1" - [(set_attr "op_type" "RXY,RRE,RIL") - (set_attr "type" "*,*,larl") - (set_attr "cpu_facility" "*,*,z10") - (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,z10_fwd_A3") - (set_attr "relative_long" "*,*,yes")]) + ll<g>hrl\t%0,%1 + vlgvh\t%0,%v1,0" + [(set_attr "op_type" "RXY,RRE,RIL,VRS") + (set_attr "type" "*,*,larl,*") + (set_attr "cpu_facility" "*,*,z10,vx") + (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,z10_fwd_A3,*") + (set_attr "relative_long" "*,*,yes,*")]) ; llhr, llcr, llghr, llgcr, llh, llc, llgh, llgc (define_insn "*zero_extend<HQI:mode><GPR:mode>2_extimm" - [(set (match_operand:GPR 0 "register_operand" "=d,d") - (zero_extend:GPR (match_operand:HQI 1 "nonimmediate_operand" "d,T")))] + [(set (match_operand:GPR 0 "register_operand" "=d,d,d") + (zero_extend:GPR (match_operand:HQI 1 "nonimmediate_operand" "d,T,v")))] "TARGET_EXTIMM" "@ ll<g><hc>r\t%0,%1 - ll<g><hc>\t%0,%1" - [(set_attr "op_type" "RRE,RXY") - (set_attr "z10prop" "z10_super_E1,z10_fwd_A3")]) + ll<g><hc>\t%0,%1 + vlgv<HQI:bhfgq>\t%0,%v1,0" + [(set_attr "op_type" "RRE,RXY,VRS") + (set_attr "cpu_facility" "*,*,vx") + (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,*")]) ; llgh, llgc (define_insn "*zero_extend<HQI:mode><GPR:mode>2" diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 12bbeb6..745634e 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -501,54 +501,6 @@ SIL,SIL,RI,RI,RRE,RRE,RIL,RR,RXY,RXY,RIL")]) -; Instructions vlgvb, vlgvh, vlgvf zero all remaining bits of a GPR, i.e., -; an implicit zero extend is done. 
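That implicit zero extension is what lets the integer zero-extend patterns above grow a "v" (vlgv) alternative and lets the dedicated vector.md patterns below be folded into combined vec_extract-plus-zero_extend patterns. A rough illustration of the source shape involved, using GCC's generic vector extension; whether a single vlgvh is actually selected depends on the architecture level and options, so treat this only as a sketch:

#include <cstdint>

// Eight 16-bit elements in a 128-bit vector register.
typedef uint16_t v8hi __attribute__ ((vector_size (16)));

// Element extraction plus zero extension into a 64-bit GPR; VLGV does
// both at once because it clears the remaining bits of the GPR.  The
// index is masked the same way the patterns mask their constant index.
uint64_t extract_and_zero_extend (v8hi v, unsigned idx)
{
  return v[idx & 7];
}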
- -(define_insn "*movdi<mode>_zero_extend_A" - [(set (match_operand:DI 0 "register_operand" "=d") - (zero_extend:DI (match_operand:SINT 1 "register_operand" "v")))] - "TARGET_VX" - "vlgv<bhfgq>\t%0,%v1,0" - [(set_attr "op_type" "VRS")]) - -(define_insn "*movsi<mode>_zero_extend_A" - [(set (match_operand:SI 0 "register_operand" "=d") - (zero_extend:SI (match_operand:HQI 1 "register_operand" "v")))] - "TARGET_VX" - "vlgv<bhfgq>\t%0,%v1,0" - [(set_attr "op_type" "VRS")]) - -(define_mode_iterator VLGV_DI [V1QI V2QI V4QI V8QI V16QI - V1HI V2HI V4HI V8HI - V1SI V2SI V4SI]) -(define_insn "*movdi<mode>_zero_extend_B" - [(set (match_operand:DI 0 "register_operand" "=d") - (zero_extend:DI (vec_select:<non_vec> - (match_operand:VLGV_DI 1 "register_operand" "v") - (parallel [(match_operand:SI 2 "const_int_operand" "n")]))))] - "TARGET_VX" -{ - operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1)); - return "vlgv<bhfgq>\t%0,%v1,%Y2"; -} - [(set_attr "op_type" "VRS") - (set_attr "mnemonic" "vlgv<bhfgq>")]) - -(define_mode_iterator VLGV_SI [V1QI V2QI V4QI V8QI V16QI - V1HI V2HI V4HI V8HI]) -(define_insn "*movsi<mode>_zero_extend_B" - [(set (match_operand:SI 0 "register_operand" "=d") - (zero_extend:SI (vec_select:<non_vec> - (match_operand:VLGV_SI 1 "register_operand" "v") - (parallel [(match_operand:SI 2 "const_int_operand" "n")]))))] - "TARGET_VX" -{ - operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1)); - return "vlgv<bhfgq>\t%0,%v1,%Y2"; -} - [(set_attr "op_type" "VRS") - (set_attr "mnemonic" "vlgv<bhfgq>")]) - ; vec_load_lanes? ; vec_store_lanes? @@ -763,6 +715,42 @@ DONE; }) +; Instructions vlgvb, vlgvh, vlgvf zero all remaining bits of a GPR, i.e., +; an implicit zero extend is done. + +(define_mode_iterator VLGV_DI [V1QI V2QI V4QI V8QI V16QI + V1HI V2HI V4HI V8HI + V1SI V2SI V4SI]) +(define_insn "*vec_extract<mode>_zero_extend" + [(set (match_operand:DI 0 "register_operand" "=d") + (zero_extend:DI (vec_select:<non_vec> + (match_operand:VLGV_DI 1 "register_operand" "v") + (parallel [(match_operand:SI 2 "nonmemory_operand" "an")]))))] + "TARGET_VX" +{ + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1)); + return "vlgv<bhfgq>\t%0,%v1,%Y2"; +} + [(set_attr "op_type" "VRS") + (set_attr "mnemonic" "vlgv<bhfgq>")]) + +(define_mode_iterator VLGV_SI [V1QI V2QI V4QI V8QI V16QI + V1HI V2HI V4HI V8HI]) +(define_insn "*vec_extract<mode>_zero_extend" + [(set (match_operand:SI 0 "register_operand" "=d") + (zero_extend:SI (vec_select:<non_vec> + (match_operand:VLGV_SI 1 "register_operand" "v") + (parallel [(match_operand:SI 2 "nonmemory_operand" "an")]))))] + "TARGET_VX" +{ + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1)); + return "vlgv<bhfgq>\t%0,%v1,%Y2"; +} + [(set_attr "op_type" "VRS") + (set_attr "mnemonic" "vlgv<bhfgq>")]) + (define_insn "*vec_vllezlf<mode>" [(set (match_operand:V_HW_4 0 "register_operand" "=v") (vec_concat:V_HW_4 diff --git a/gcc/config/xtensa/constraints.md b/gcc/config/xtensa/constraints.md index 77c9571..727ec1e 100644 --- a/gcc/config/xtensa/constraints.md +++ b/gcc/config/xtensa/constraints.md @@ -130,7 +130,7 @@ (and (match_code "mem") (match_test "smalloffset_mem_p (op)"))) -(define_memory_constraint "T" +(define_special_memory_constraint "T" "Memory in a literal pool (addressable with an L32R instruction)." 
(and (match_code "mem") (match_test "!TARGET_CONST16 && constantpool_mem_p (op)"))) diff --git a/gcc/config/xtensa/predicates.md b/gcc/config/xtensa/predicates.md index 9aeaba6..20160a4 100644 --- a/gcc/config/xtensa/predicates.md +++ b/gcc/config/xtensa/predicates.md @@ -189,6 +189,9 @@ (define_predicate "ubranch_operator" (match_code "ltu,geu")) +(define_predicate "alt_ubranch_operator" + (match_code "gtu,leu")) + (define_predicate "boolean_operator" (match_code "eq,ne")) diff --git a/gcc/config/xtensa/xtensa-protos.h b/gcc/config/xtensa/xtensa-protos.h index 1f5dcf5..98e75c6 100644 --- a/gcc/config/xtensa/xtensa-protos.h +++ b/gcc/config/xtensa/xtensa-protos.h @@ -60,6 +60,7 @@ extern bool xtensa_tls_referenced_p (rtx); extern enum rtx_code xtensa_shlrd_which_direction (rtx, rtx); extern bool xtensa_split1_finished_p (void); extern void xtensa_split_DI_reg_imm (rtx *); +extern char *xtensa_bswapsi2_output (rtx_insn *, const char *); #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, int); diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index d75cba4..f3b89de 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -2645,6 +2645,94 @@ xtensa_split_DI_reg_imm (rtx *operands) } +/* Return the asm output string of bswapsi2_internal insn pattern. + It does this by scanning backwards for the BB from the specified insn, + and if an another bswapsi2_internal is found, it omits the instruction + to set SAR to 8. If not found, or if a CALL, JUMP, ASM, or other insn + that clobbers SAR is found first, prepend an instruction to set SAR to + 8 as usual. */ + +static int +xtensa_bswapsi2_output_1 (rtx_insn *insn) +{ + int icode; + rtx pat; + const char *iname; + + /* CALL insn do not preserve SAR. + JUMP insn only appear at the end of BB, so they do not need to be + considered when scanning backwards. */ + if (CALL_P (insn)) + return -1; + + switch (icode = INSN_CODE (insn)) + { + /* rotate insns clobber SAR. */ + case CODE_FOR_rotlsi3: + case CODE_FOR_rotrsi3: + return -1; + /* simple shift insns clobber SAR if non-immediate shift amounts. */ + case CODE_FOR_ashlsi3_internal: + case CODE_FOR_ashrsi3: + case CODE_FOR_lshrsi3: + if (! CONST_INT_P (XEXP (SET_SRC (PATTERN (insn)), 1))) + return -1; + break; + /* this insn always set SAR to 8. */ + case CODE_FOR_bswapsi2_internal: + return 1; + default: + break; + } + + /* "*shift_per_byte" and "*shlrd_*" complex shift insns clobber SAR. */ + if (icode >= CODE_FOR_nothing + && (! strcmp (iname = insn_data[icode].name, "*shift_per_byte") + || ! strncmp (iname, "*shlrd_", 7))) + return -1; + + /* asm statements may also clobber SAR, so they are anything goes. */ + if (NONJUMP_INSN_P (insn)) + switch (GET_CODE (pat = PATTERN (insn))) + { + case SET: + return GET_CODE (SET_SRC (pat)) == ASM_OPERANDS ? -1 : 0; + case PARALLEL: + return (GET_CODE (pat = XVECEXP (pat, 0, 0)) == SET + && GET_CODE (SET_SRC (pat)) == ASM_OPERANDS) + || GET_CODE (pat) == ASM_OPERANDS + || GET_CODE (pat) == ASM_INPUT ? -1 : 0; + case ASM_OPERANDS: + return -1; + default: + break; + } + + /* All other insns are not interested in SAR. 
*/ + return 0; +} + +char * +xtensa_bswapsi2_output (rtx_insn *insn, const char *output) +{ + static char result[128]; + int i; + + strcpy (result, "ssai\t8\n\t"); + while ((insn = prev_nonnote_nondebug_insn_bb (insn))) + if ((i = xtensa_bswapsi2_output_1 (insn)) < 0) + break; + else if (i > 0) + { + result[0] = '\0'; + break; + } + strcat (result, output); + + return result; +} + + /* Try to split an integer value into what are suitable for two consecutive immediate addition instructions, ADDI or ADDMI. */ diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md index 629dfdd..52ffb16 100644 --- a/gcc/config/xtensa/xtensa.md +++ b/gcc/config/xtensa/xtensa.md @@ -88,6 +88,7 @@ ;; This mode iterator allows the HI and QI patterns to be defined from ;; the same template. (define_mode_iterator HQI [HI QI]) +(define_mode_attr mode_bits [(HI "16") (QI "8")]) ;; This mode iterator allows the SI and HI patterns to be defined from ;; the same template. @@ -176,19 +177,18 @@ ;; Addition. (define_insn "addsi3" - [(set (match_operand:SI 0 "register_operand" "=D,D,a,a,a") - (plus:SI (match_operand:SI 1 "register_operand" "%d,d,r,r,r") - (match_operand:SI 2 "add_operand" "d,O,r,J,N")))] - "" - "@ - add.n\t%0, %1, %2 - addi.n\t%0, %1, %d2 - add\t%0, %1, %2 - addi\t%0, %1, %d2 - addmi\t%0, %1, %x2" - [(set_attr "type" "arith,arith,arith,arith,arith") - (set_attr "mode" "SI") - (set_attr "length" "2,2,3,3,3")]) + [(set (match_operand:SI 0 "register_operand") + (plus:SI (match_operand:SI 1 "register_operand") + (match_operand:SI 2 "add_operand")))] + "" + {@ [cons: =0, %1, 2; attrs: type, length] + [D, d, d; arith, 2] add.n\t%0, %1, %2 + [D, d, O; arith, 2] addi.n\t%0, %1, %d2 + [a, r, r; arith, 3] add\t%0, %1, %2 + [a, r, J; arith, 3] addi\t%0, %1, %d2 + [a, r, N; arith, 3] addmi\t%0, %1, %x2 + } + [(set_attr "mode" "SI")]) (define_insn "*addsubx" [(set (match_operand:SI 0 "register_operand" "=a") @@ -392,18 +392,15 @@ (set_attr "length" "3")]) (define_insn "<u>mulhisi3" - [(set (match_operand:SI 0 "register_operand" "=C,A") - (mult:SI (any_extend:SI - (match_operand:HI 1 "register_operand" "%r,r")) - (any_extend:SI - (match_operand:HI 2 "register_operand" "r,r"))))] + [(set (match_operand:SI 0 "register_operand") + (mult:SI (any_extend:SI (match_operand:HI 1 "register_operand")) + (any_extend:SI (match_operand:HI 2 "register_operand"))))] "TARGET_MUL16 || TARGET_MAC16" - "@ - mul16<su>\t%0, %1, %2 - <u>mul.aa.ll\t%1, %2" - [(set_attr "type" "mul16,mac16") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) + {@ [cons: =0, %1, 2; attrs: type, length] + [C, r, r; mul16, 3] mul16<su>\t%0, %1, %2 + [A, r, r; mac16, 3] <u>mul.aa.ll\t%1, %2 + } + [(set_attr "mode" "SI")]) (define_insn "muladdhisi" [(set (match_operand:SI 0 "register_operand" "=A") @@ -652,36 +649,15 @@ }) (define_insn "bswapsi2_internal" - [(set (match_operand:SI 0 "register_operand" "=a,&a") - (bswap:SI (match_operand:SI 1 "register_operand" "0,r"))) - (clobber (match_scratch:SI 2 "=&a,X"))] + [(set (match_operand:SI 0 "register_operand") + (bswap:SI (match_operand:SI 1 "register_operand"))) + (clobber (match_scratch:SI 2))] "!optimize_debug && optimize > 1 && !optimize_size" -{ - rtx_insn *prev_insn = prev_nonnote_nondebug_insn (insn); - const char *init = "ssai\t8\;"; - static char result[128]; - if (prev_insn && NONJUMP_INSN_P (prev_insn)) - { - rtx x = PATTERN (prev_insn); - if (GET_CODE (x) == PARALLEL && XVECLEN (x, 0) == 2 - && GET_CODE (XVECEXP (x, 0, 0)) == SET - && GET_CODE (XVECEXP (x, 0, 1)) == CLOBBER) - { - x = 
XEXP (XVECEXP (x, 0, 0), 1); - if (GET_CODE (x) == BSWAP && GET_MODE (x) == SImode) - init = ""; - } - } - sprintf (result, - (which_alternative == 0) - ? "%s" "srli\t%%2, %%1, 16\;src\t%%2, %%2, %%1\;src\t%%2, %%2, %%2\;src\t%%0, %%1, %%2" - : "%s" "srli\t%%0, %%1, 16\;src\t%%0, %%0, %%1\;src\t%%0, %%0, %%0\;src\t%%0, %%1, %%0", - init); - return result; -} - [(set_attr "type" "arith,arith") - (set_attr "mode" "SI") - (set_attr "length" "15,15")]) + {@ [cons: =0, 1, =2; attrs: type, length] + [ a, 0, &a; arith, 15] << xtensa_bswapsi2_output (insn, "srli\t%2, %1, 16\;src\t%2, %2, %1\;src\t%2, %2, %2\;src\t%0, %1, %2"); + [&a, r, X; arith, 15] << xtensa_bswapsi2_output (insn, "srli\t%0, %1, 16\;src\t%0, %0, %1\;src\t%0, %0, %0\;src\t%0, %1, %0"); + } + [(set_attr "mode" "SI")]) (define_expand "bswapdi2" [(set (match_operand:DI 0 "register_operand" "") @@ -742,16 +718,15 @@ ;; Logical instructions. (define_insn "andsi3" - [(set (match_operand:SI 0 "register_operand" "=a,a") - (and:SI (match_operand:SI 1 "register_operand" "%r,r") - (match_operand:SI 2 "mask_operand" "P,r")))] + [(set (match_operand:SI 0 "register_operand") + (and:SI (match_operand:SI 1 "register_operand") + (match_operand:SI 2 "mask_operand")))] "" - "@ - extui\t%0, %1, 0, %K2 - and\t%0, %1, %2" - [(set_attr "type" "arith,arith") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) + {@ [cons: =0, %1, 2; attrs: type, length] + [a, r, P; arith, 3] extui\t%0, %1, 0, %K2 + [a, r, r; arith, 3] and\t%0, %1, %2 + } + [(set_attr "mode" "SI")]) (define_insn_and_split "*andsi3_bitcmpl" [(set (match_operand:SI 0 "register_operand" "=a") @@ -944,27 +919,15 @@ ;; Zero-extend instructions. -(define_insn "zero_extendhisi2" - [(set (match_operand:SI 0 "register_operand" "=a,a") - (zero_extend:SI (match_operand:HI 1 "nonimmed_operand" "r,U")))] - "" - "@ - extui\t%0, %1, 0, 16 - %v1l16ui\t%0, %1" - [(set_attr "type" "arith,load") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) - -(define_insn "zero_extendqisi2" - [(set (match_operand:SI 0 "register_operand" "=a,a") - (zero_extend:SI (match_operand:QI 1 "nonimmed_operand" "r,U")))] +(define_insn "zero_extend<mode>si2" + [(set (match_operand:SI 0 "register_operand") + (zero_extend:SI (match_operand:HQI 1 "nonimmed_operand")))] "" - "@ - extui\t%0, %1, 0, 8 - %v1l8ui\t%0, %1" - [(set_attr "type" "arith,load") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) + {@ [cons: =0, 1; attrs: type, length] + [a, r; arith, 3] extui\t%0, %1, 0, <mode_bits> + [a, U; load , 3] %v1l<mode_bits>ui\t%0, %1 + } + [(set_attr "mode" "SI")]) ;; Sign-extend instructions. 
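Returning to bswapsi2_internal above: the following standalone check (not GCC code) models what the four-instruction sequence computes once SAR holds 8, treating SRC as a funnel shift that takes the 64-bit concatenation of its two source registers, shifts it right by SAR, and keeps the low 32 bits. It also shows why xtensa_bswapsi2_output may drop the leading "ssai 8" when a preceding byte swap already left SAR at 8 and nothing in between clobbers it.

#include <cassert>
#include <cstdint>

// SRC with SAR == 8: low 32 bits of (hi:lo) >> 8.
static uint32_t src8 (uint32_t hi, uint32_t lo)
{
  return (hi << 24) | (lo >> 8);
}

// The sequence emitted by bswapsi2_internal (first alternative).
static uint32_t bswap_via_src (uint32_t x)
{
  uint32_t t = x >> 16;      // srli t, x, 16
  t = src8 (t, x);           // src  t, t, x
  t = src8 (t, t);           // src  t, t, t
  return src8 (x, t);        // src  d, x, t
}

int main ()
{
  assert (bswap_via_src (0x11223344u) == 0x44332211u);
  assert (bswap_via_src (0xdeadbeefu) == __builtin_bswap32 (0xdeadbeefu));
  return 0;
}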
@@ -982,15 +945,14 @@
 })
 
 (define_insn "extendhisi2_internal"
-  [(set (match_operand:SI 0 "register_operand" "=B,a")
-        (sign_extend:SI (match_operand:HI 1 "sext_operand" "r,U")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (sign_extend:SI (match_operand:HI 1 "sext_operand")))]
   ""
-  "@
-   sext\t%0, %1, 15
-   %v1l16si\t%0, %1"
-  [(set_attr "type" "arith,load")
-   (set_attr "mode" "SI")
-   (set_attr "length" "3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [B, r; arith, 3] sext\t%0, %1, 15
+     [a, U; load , 3] %v1l16si\t%0, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_expand "extendqisi2"
   [(set (match_operand:SI 0 "register_operand" "")
@@ -1327,29 +1289,28 @@
 })
 
 (define_insn "movsi_internal"
-  [(set (match_operand:SI 0 "nonimmed_operand" "=D,D,D,D,R,R,a,q,a,a,W,a,a,U,*a,*A")
-        (match_operand:SI 1 "move_operand" "M,D,d,R,D,d,r,r,I,Y,i,T,U,r,*A,*r"))]
+  [(set (match_operand:SI 0 "nonimmed_operand")
+        (match_operand:SI 1 "move_operand"))]
   "xtensa_valid_move (SImode, operands)"
-  "@
-   movi.n\t%0, %x1
-   mov.n\t%0, %1
-   mov.n\t%0, %1
-   %v1l32i.n\t%0, %1
-   %v0s32i.n\t%1, %0
-   %v0s32i.n\t%1, %0
-   mov\t%0, %1
-   movsp\t%0, %1
-   movi\t%0, %x1
-   movi\t%0, %1
-   const16\t%0, %t1\;const16\t%0, %b1
-   %v1l32r\t%0, %1
-   %v1l32i\t%0, %1
-   %v0s32i\t%1, %0
-   rsr\t%0, ACCLO
-   wsr\t%1, ACCLO"
-  [(set_attr "type" "move,move,move,load,store,store,move,move,move,load,move,load,load,store,rsr,wsr")
-   (set_attr "mode" "SI")
-   (set_attr "length" "2,2,2,2,2,2,3,3,3,3,6,3,3,3,3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [ D, M; move , 2] movi.n\t%0, %x1
+     [ D, D; move , 2] mov.n\t%0, %1
+     [ D, d; move , 2] ^
+     [ D, R; load , 2] %v1l32i.n\t%0, %1
+     [ R, D; store, 2] %v0s32i.n\t%1, %0
+     [ R, d; store, 2] ^
+     [ a, r; move , 3] mov\t%0, %1
+     [ q, r; move , 3] movsp\t%0, %1
+     [ a, I; move , 3] movi\t%0, %x1
+     [ a, Y; load , 3] movi\t%0, %1
+     [ W, i; move , 6] const16\t%0, %t1\;const16\t%0, %b1
+     [ a, T; load , 3] %v1l32r\t%0, %1
+     [ a, U; load , 3] %v1l32i\t%0, %1
+     [ U, r; store, 3] %v0s32i\t%1, %0
+     [*a, *A; rsr , 3] rsr\t%0, ACCLO
+     [*A, *r; wsr , 3] wsr\t%1, ACCLO
+  }
+  [(set_attr "mode" "SI")])
 
 (define_split
   [(set (match_operand:SHI 0 "register_operand")
@@ -1399,23 +1360,22 @@
 })
 
 (define_insn "movhi_internal"
-  [(set (match_operand:HI 0 "nonimmed_operand" "=D,D,a,a,a,a,a,U,*a,*A")
-        (match_operand:HI 1 "move_operand" "M,d,r,I,Y,T,U,r,*A,*r"))]
+  [(set (match_operand:HI 0 "nonimmed_operand")
+        (match_operand:HI 1 "move_operand"))]
   "xtensa_valid_move (HImode, operands)"
-  "@
-   movi.n\t%0, %x1
-   mov.n\t%0, %1
-   mov\t%0, %1
-   movi\t%0, %x1
-   movi\t%0, %1
-   %v1l32r\t%0, %1
-   %v1l16ui\t%0, %1
-   %v0s16i\t%1, %0
-   rsr\t%0, ACCLO
-   wsr\t%1, ACCLO"
-  [(set_attr "type" "move,move,move,move,load,load,load,store,rsr,wsr")
-   (set_attr "mode" "HI")
-   (set_attr "length" "2,2,3,3,3,3,3,3,3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [ D, M; move , 2] movi.n\t%0, %x1
+     [ D, d; move , 2] mov.n\t%0, %1
+     [ a, r; move , 3] mov\t%0, %1
+     [ a, I; move , 3] movi\t%0, %x1
+     [ a, Y; load , 3] movi\t%0, %1
+     [ a, T; load , 3] %v1l32r\t%0, %1
+     [ a, U; load , 3] %v1l16ui\t%0, %1
+     [ U, r; store, 3] %v0s16i\t%1, %0
+     [*a, *A; rsr , 3] rsr\t%0, ACCLO
+     [*A, *r; wsr , 3] wsr\t%1, ACCLO
+  }
+  [(set_attr "mode" "HI")])
 
 ;; 8-bit Integer moves
 
@@ -1429,21 +1389,20 @@
 })
 
 (define_insn "movqi_internal"
-  [(set (match_operand:QI 0 "nonimmed_operand" "=D,D,a,a,a,U,*a,*A")
-        (match_operand:QI 1 "move_operand" "M,d,r,I,U,r,*A,*r"))]
+  [(set (match_operand:QI 0 "nonimmed_operand")
+        (match_operand:QI 1 "move_operand"))]
   "xtensa_valid_move (QImode, operands)"
-  "@
-   movi.n\t%0, %x1
-   mov.n\t%0, %1
-   mov\t%0, %1
-   movi\t%0, %x1
-   %v1l8ui\t%0, %1
-   %v0s8i\t%1, %0
-   rsr\t%0, ACCLO
-   wsr\t%1, ACCLO"
-  [(set_attr "type" "move,move,move,move,load,store,rsr,wsr")
-   (set_attr "mode" "QI")
-   (set_attr "length" "2,2,3,3,3,3,3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [ D, M; move , 2] movi.n\t%0, %x1
+     [ D, d; move , 2] mov.n\t%0, %1
+     [ a, r; move , 3] mov\t%0, %1
+     [ a, I; move , 3] movi\t%0, %x1
+     [ a, U; load , 3] %v1l8ui\t%0, %1
+     [ U, r; store, 3] %v0s8i\t%1, %0
+     [*a, *A; rsr , 3] rsr\t%0, ACCLO
+     [*A, *r; wsr , 3] wsr\t%1, ACCLO
+  }
+  [(set_attr "mode" "QI")])
 
 ;; Sub-word reloads from the constant pool.
 
@@ -1501,30 +1460,29 @@
 })
 
 (define_insn "movsf_internal"
-  [(set (match_operand:SF 0 "nonimmed_operand" "=f,f,U,D,a,D,R,a,f,a,a,W,a,U")
-        (match_operand:SF 1 "move_operand" "f,^U,f,d,T,R,d,r,r,f,Y,iF,U,r"))]
+  [(set (match_operand:SF 0 "nonimmed_operand")
+        (match_operand:SF 1 "move_operand"))]
   "((register_operand (operands[0], SFmode)
      || register_operand (operands[1], SFmode))
    && !(FP_REG_P (xt_true_regnum (operands[0]))
        && (constantpool_mem_p (operands[1]) || CONSTANT_P (operands[1]))))"
-  "@
-   mov.s\t%0, %1
-   %v1lsi\t%0, %1
-   %v0ssi\t%1, %0
-   mov.n\t%0, %1
-   %v1l32r\t%0, %1
-   %v1l32i.n\t%0, %1
-   %v0s32i.n\t%1, %0
-   mov\t%0, %1
-   wfr\t%0, %1
-   rfr\t%0, %1
-   movi\t%0, %y1
-   const16\t%0, %t1\;const16\t%0, %b1
-   %v1l32i\t%0, %1
-   %v0s32i\t%1, %0"
-  [(set_attr "type" "farith,fload,fstore,move,load,load,store,move,farith,farith,load,move,load,store")
-   (set_attr "mode" "SF")
-   (set_attr "length" "3,3,3,2,3,2,2,3,3,3,3,6,3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [f, f; farith, 3] mov.s\t%0, %1
+     [f, ^U; fload , 3] %v1lsi\t%0, %1
+     [U, f; fstore, 3] %v0ssi\t%1, %0
+     [D, d; move , 2] mov.n\t%0, %1
+     [a, T; load , 3] %v1l32r\t%0, %1
+     [D, R; load , 2] %v1l32i.n\t%0, %1
+     [R, d; store , 2] %v0s32i.n\t%1, %0
+     [a, r; move , 3] mov\t%0, %1
+     [f, r; farith, 3] wfr\t%0, %1
+     [a, f; farith, 3] rfr\t%0, %1
+     [a, Y; load , 3] movi\t%0, %y1
+     [W, iF; move , 6] const16\t%0, %t1\;const16\t%0, %b1
+     [a, U; load , 3] %v1l32i\t%0, %1
+     [U, r; store , 3] %v0s32i\t%1, %0
+  }
+  [(set_attr "mode" "SF")])
 
 (define_insn "*lsiu"
   [(set (match_operand:SF 0 "register_operand" "=f")
@@ -1692,16 +1650,15 @@
 })
 
 (define_insn "ashlsi3_internal"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (ashift:SI (match_operand:SI 1 "register_operand" "r,r")
-                   (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (ashift:SI (match_operand:SI 1 "register_operand")
+                   (match_operand:SI 2 "arith_operand")))]
   ""
-  "@
-   slli\t%0, %1, %R2
-   ssl\t%2\;sll\t%0, %1"
-  [(set_attr "type" "arith,arith")
-   (set_attr "mode" "SI")
-   (set_attr "length" "3,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; arith, 3] slli\t%0, %1, %R2
+     [a, r, r; arith, 6] ssl\t%2\;sll\t%0, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_split
   [(set (match_operand:SI 0 "register_operand")
@@ -1713,35 +1670,26 @@
                    (match_dup 1)))])
 
 (define_insn "ashrsi3"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (ashiftrt:SI (match_operand:SI 1 "register_operand" "r,r")
-                     (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (ashiftrt:SI (match_operand:SI 1 "register_operand")
+                     (match_operand:SI 2 "arith_operand")))]
   ""
-  "@
-   srai\t%0, %1, %R2
-   ssr\t%2\;sra\t%0, %1"
-  [(set_attr "type" "arith,arith")
-   (set_attr "mode" "SI")
-   (set_attr "length" "3,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; arith, 3] srai\t%0, %1, %R2
+     [a, r, r; arith, 6] ssr\t%2\;sra\t%0, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_insn "lshrsi3"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (lshiftrt:SI (match_operand:SI 1 "register_operand" "r,r")
-                     (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (lshiftrt:SI (match_operand:SI 1 "register_operand")
+                     (match_operand:SI 2 "arith_operand")))]
   ""
-{
-  if (which_alternative == 0)
-    {
-      if ((INTVAL (operands[2]) & 0x1f) < 16)
-        return "srli\t%0, %1, %R2";
-      else
-        return "extui\t%0, %1, %R2, %L2";
-    }
-  return "ssr\t%2\;srl\t%0, %1";
-}
-  [(set_attr "type" "arith,arith")
-   (set_attr "mode" "SI")
-   (set_attr "length" "3,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; arith, 3] << (INTVAL (operands[2]) & 0x1f) < 16 ? \"srli\t%0, %1, %R2\" : \"extui\t%0, %1, %R2, %L2\";
+     [a, r, r; arith, 6] ssr\t%2\;srl\t%0, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_insn "*shift_per_byte"
   [(set (match_operand:SI 0 "register_operand" "=a")
@@ -1944,28 +1892,26 @@
    (set_attr "length" "6")])
 
 (define_insn "rotlsi3"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (rotate:SI (match_operand:SI 1 "register_operand" "r,r")
-                   (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (rotate:SI (match_operand:SI 1 "register_operand")
+                   (match_operand:SI 2 "arith_operand")))]
   ""
-  "@
-   ssai\t%L2\;src\t%0, %1, %1
-   ssl\t%2\;src\t%0, %1, %1"
-  [(set_attr "type" "multi,multi")
-   (set_attr "mode" "SI")
-   (set_attr "length" "6,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; multi, 6] ssai\t%L2\;src\t%0, %1, %1
+     [a, r, r; multi, 6] ssl\t%2\;src\t%0, %1, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_insn "rotrsi3"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (rotatert:SI (match_operand:SI 1 "register_operand" "r,r")
-                     (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (rotatert:SI (match_operand:SI 1 "register_operand")
+                     (match_operand:SI 2 "arith_operand")))]
  ""
-  "@
-   ssai\t%R2\;src\t%0, %1, %1
-   ssr\t%2\;src\t%0, %1, %1"
-  [(set_attr "type" "multi,multi")
-   (set_attr "mode" "SI")
-   (set_attr "length" "6,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; multi, 6] ssai\t%R2\;src\t%0, %1, %1
+     [a, r, r; multi, 6] ssr\t%2\;src\t%0, %1, %1
+  }
+  [(set_attr "mode" "SI")])
 
 ;; Comparisons.
 
@@ -2024,26 +1970,23 @@
                         [(match_operand:SI 0 "register_operand" "r")
                          (const_int -2147483648)])
                       (label_ref (match_operand 1 ""))
-                      (pc)))]
+                      (pc)))
+   (clobber (match_scratch:SI 3 "=a"))]
   "TARGET_ABS"
   "#"
-  "&& can_create_pseudo_p ()"
+  "&& 1"
   [(set (match_dup 3)
        (abs:SI (match_dup 0)))
    (set (pc)
        (if_then_else (match_op_dup 2
-                       [(zero_extract:SI (match_dup 3)
-                                         (const_int 1)
-                                         (match_dup 4))
+                       [(match_dup 3)
                         (const_int 0)])
                      (label_ref (match_dup 1))
                      (pc)))]
 {
-  operands[3] = gen_reg_rtx (SImode);
-  operands[4] = GEN_INT (BITS_BIG_ENDIAN ? 0 : 31);
-  operands[2] = gen_rtx_fmt_ee (reverse_condition (GET_CODE (operands[2])),
-                                VOIDmode, XEXP (operands[2], 0),
-                                const0_rtx);
+  if (GET_CODE (operands[3]) == SCRATCH)
+    operands[3] = gen_reg_rtx (SImode);
+  PUT_CODE (operands[2], GET_CODE (operands[2]) == EQ ? LT : GE);
 }
  [(set_attr "type" "jump")
   (set_attr "mode" "none")
@@ -2190,7 +2133,7 @@
                      (label_ref (match_dup 1))
                      (pc)))]
 {
-  operands[3] = GEN_INT ((1 << GET_MODE_BITSIZE (GET_MODE (operands[3]))) - 1);
+  operands[3] = GEN_INT (GET_MODE_MASK (GET_MODE (operands[3])));
 })
 
 (define_insn_and_split "*masktrue_const_pow2_minus_one"
@@ -3370,6 +3313,42 @@
                           (const_int 8)
                           (const_int 9))))])
 
+(define_insn_and_split "*eqne_in_range"
+  [(set (pc)
+        (if_then_else (match_operator 4 "alt_ubranch_operator"
+                        [(plus:SI (match_operand:SI 0 "register_operand" "r")
+                                  (match_operand:SI 1 "const_int_operand" "i"))
+                         (match_operand:SI 2 "const_int_operand" "i")])
+                      (label_ref (match_operand 3 ""))
+                      (pc)))
+   (clobber (match_scratch:SI 5 "=&a"))]
+  "TARGET_MINMAX && TARGET_CLAMPS
+   && INTVAL (operands[1]) * 2 - INTVAL (operands[2]) == 1
+   && IN_RANGE (exact_log2 (INTVAL (operands[1])), 7, 22)"
+  "#"
+  "&& 1"
+  [(set (match_dup 5)
+        (smin:SI (smax:SI (match_dup 0)
+                          (match_dup 1))
+                 (match_dup 2)))
+   (set (pc)
+        (if_then_else (match_op_dup 4
+                        [(match_dup 0)
+                         (match_dup 5)])
+                      (label_ref (match_dup 3))
+                      (pc)))]
+{
+  HOST_WIDE_INT v = INTVAL (operands[1]);
+  operands[1] = GEN_INT (-v);
+  operands[2] = GEN_INT (v - 1);
+  PUT_CODE (operands[4], GET_CODE (operands[4]) == GTU ? NE : EQ);
+  if (GET_CODE (operands[5]) == SCRATCH)
+    operands[5] = gen_reg_rtx (SImode);
+}
+ [(set_attr "type" "jump")
+  (set_attr "mode" "none")
+  (set_attr "length" "6")])
+
 (define_split
   [(clobber (match_operand 0 "register_operand"))]
  "HARD_REGISTER_P (operands[0]) |