author     Jerry DeLisle <jvdelisle@gcc.gnu.org>   2025-09-02 15:58:26 -0700
committer  Jerry DeLisle <jvdelisle@gcc.gnu.org>   2025-09-02 15:58:26 -0700
commit     071b4126c613881f4cb25b4e5c39032964827f88 (patch)
tree       7ed805786566918630d1d617b1ed8f7310f5fd8e /gcc/config
parent     845d23f3ea08ba873197c275a8857eee7edad996 (diff)
parent     caa1c2f42691d68af4d894a5c3e700ecd2dba080 (diff)
Merge branch 'master' into gfortran-test (devel/gfortran-test)
Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/aarch64/aarch64-cc-fusion.cc | 297
-rw-r--r--  gcc/config/aarch64/aarch64-cores.def | 2
-rw-r--r--  gcc/config/aarch64/aarch64-option-extensions.def | 12
-rw-r--r--  gcc/config/aarch64/aarch64-passes.def | 1
-rw-r--r--  gcc/config/aarch64/aarch64-protos.h | 9
-rw-r--r--  gcc/config/aarch64/aarch64-sme.md | 15
-rw-r--r--  gcc/config/aarch64/aarch64-sve-builtins-base.cc | 45
-rw-r--r--  gcc/config/aarch64/aarch64-sve-builtins-functions.h | 8
-rw-r--r--  gcc/config/aarch64/aarch64-sve-builtins-sme.def | 3
-rw-r--r--  gcc/config/aarch64/aarch64-sve-builtins-sve2.cc | 4
-rw-r--r--  gcc/config/aarch64/aarch64-sve-builtins.cc | 30
-rw-r--r--  gcc/config/aarch64/aarch64-sve.md | 985
-rw-r--r--  gcc/config/aarch64/aarch64-sve2.md | 104
-rw-r--r--  gcc/config/aarch64/aarch64.cc | 421
-rw-r--r--  gcc/config/aarch64/aarch64.h | 5
-rw-r--r--  gcc/config/aarch64/aarch64.md | 221
-rw-r--r--  gcc/config/aarch64/constraints.md | 10
-rw-r--r--  gcc/config/aarch64/iterators.md | 35
-rw-r--r--  gcc/config/aarch64/predicates.md | 29
-rw-r--r--  gcc/config/aarch64/t-aarch64 | 6
-rw-r--r--  gcc/config/aarch64/tuning_models/generic_armv9_a.h | 2
-rw-r--r--  gcc/config/aarch64/tuning_models/olympus.h | 210
-rw-r--r--  gcc/config/arc/arc.md | 6
-rw-r--r--  gcc/config/arm/arm.cc | 5
-rw-r--r--  gcc/config/arm/arm.md | 17
-rw-r--r--  gcc/config/arm/thumb1.md | 9
-rw-r--r--  gcc/config/arm/thumb2.md | 21
-rw-r--r--  gcc/config/avr/avr-dimode.md | 87
-rw-r--r--  gcc/config/avr/avr-fixed.md | 129
-rw-r--r--  gcc/config/avr/avr-log.cc | 1
-rw-r--r--  gcc/config/avr/avr-passes.cc | 139
-rw-r--r--  gcc/config/avr/avr-passes.def | 8
-rw-r--r--  gcc/config/avr/avr-protos.h | 5
-rw-r--r--  gcc/config/avr/avr.cc | 183
-rw-r--r--  gcc/config/avr/avr.h | 18
-rw-r--r--  gcc/config/avr/avr.md | 1253
-rw-r--r--  gcc/config/avr/avr.opt | 8
-rw-r--r--  gcc/config/avr/avr.opt.urls | 5
-rw-r--r--  gcc/config/avr/specs.h | 2
-rw-r--r--  gcc/config/cris/cris.h | 2
-rw-r--r--  gcc/config/darwin-sections.def | 7
-rw-r--r--  gcc/config/darwin.cc | 67
-rw-r--r--  gcc/config/darwin.h | 30
-rw-r--r--  gcc/config/gcn/gcn-opts.h | 2
-rw-r--r--  gcc/config/gcn/gcn-valu.md | 4
-rw-r--r--  gcc/config/gcn/gcn.cc | 103
-rw-r--r--  gcc/config/gcn/gcn.md | 40
-rw-r--r--  gcc/config/h8300/addsub.md | 2
-rw-r--r--  gcc/config/h8300/jumpcall.md | 12
-rw-r--r--  gcc/config/h8300/testcompare.md | 26
-rw-r--r--  gcc/config/i386/i386-expand.cc | 129
-rw-r--r--  gcc/config/i386/i386-features.cc | 1130
-rw-r--r--  gcc/config/i386/i386-modes.def | 2
-rw-r--r--  gcc/config/i386/i386-options.cc | 45
-rw-r--r--  gcc/config/i386/i386-passes.def | 2
-rw-r--r--  gcc/config/i386/i386-protos.h | 5
-rw-r--r--  gcc/config/i386/i386.cc | 305
-rw-r--r--  gcc/config/i386/i386.h | 59
-rw-r--r--  gcc/config/i386/i386.md | 449
-rw-r--r--  gcc/config/i386/i386.opt | 4
-rw-r--r--  gcc/config/i386/predicates.md | 17
-rw-r--r--  gcc/config/i386/sse.md | 135
-rw-r--r--  gcc/config/i386/x86-tune-costs.h | 192
-rw-r--r--  gcc/config/loongarch/genopts/isa-evolution.in | 1
-rw-r--r--  gcc/config/loongarch/loongarch-def.cc | 4
-rw-r--r--  gcc/config/loongarch/loongarch-def.h | 10
-rw-r--r--  gcc/config/loongarch/loongarch-evolution.cc | 4
-rw-r--r--  gcc/config/loongarch/loongarch-evolution.h | 8
-rw-r--r--  gcc/config/loongarch/loongarch-str.h | 1
-rw-r--r--  gcc/config/loongarch/loongarch.cc | 81
-rw-r--r--  gcc/config/loongarch/loongarch.h | 6
-rw-r--r--  gcc/config/loongarch/loongarch.opt | 4
-rw-r--r--  gcc/config/loongarch/loongarch.opt.urls | 3
-rw-r--r--  gcc/config/loongarch/simd.md | 2
-rw-r--r--  gcc/config/loongarch/sync.md | 641
-rw-r--r--  gcc/config/mips/mips.h | 10
-rw-r--r--  gcc/config/mips/mips.opt | 4
-rw-r--r--  gcc/config/nvptx/nvptx.opt | 45
-rw-r--r--  gcc/config/pru/pru.cc | 11
-rw-r--r--  gcc/config/pru/pru.h | 3
-rw-r--r--  gcc/config/pru/pru.md | 28
-rw-r--r--  gcc/config/pru/pru.opt | 8
-rw-r--r--  gcc/config/pru/pru.opt.urls | 6
-rw-r--r--  gcc/config/pru/t-multilib | 29
-rwxr-xr-x  gcc/config/riscv/arch-canonicalize | 583
-rw-r--r--  gcc/config/riscv/autovec-opt.md | 182
-rw-r--r--  gcc/config/riscv/constraints.md | 4
-rw-r--r--  gcc/config/riscv/gen-riscv-ext-opt.cc | 44
-rw-r--r--  gcc/config/riscv/gen-riscv-mcpu-texi.cc | 43
-rw-r--r--  gcc/config/riscv/gen-riscv-mtune-texi.cc | 41
-rw-r--r--  gcc/config/riscv/predicates.md | 20
-rw-r--r--  gcc/config/riscv/riscv-avlprop.cc | 13
-rw-r--r--  gcc/config/riscv/riscv-cores.def | 8
-rw-r--r--  gcc/config/riscv/riscv-ext-mips.def | 13
-rw-r--r--  gcc/config/riscv/riscv-ext.opt | 2
-rw-r--r--  gcc/config/riscv/riscv-protos.h | 8
-rw-r--r--  gcc/config/riscv/riscv-subset.h | 13
-rw-r--r--  gcc/config/riscv/riscv-target-attr.cc | 102
-rw-r--r--  gcc/config/riscv/riscv-v.cc | 165
-rw-r--r--  gcc/config/riscv/riscv-vector-costs.cc | 28
-rw-r--r--  gcc/config/riscv/riscv.cc | 399
-rw-r--r--  gcc/config/riscv/riscv.h | 27
-rw-r--r--  gcc/config/riscv/riscv.md | 71
-rw-r--r--  gcc/config/riscv/sifive-p400.md | 20
-rw-r--r--  gcc/config/riscv/sifive-p600.md | 17
-rw-r--r--  gcc/config/riscv/sync.md | 8
-rw-r--r--  gcc/config/riscv/t-riscv | 37
-rw-r--r--  gcc/config/riscv/t-rtems | 9
-rw-r--r--  gcc/config/riscv/vector.md | 265
-rw-r--r--  gcc/config/riscv/xiangshan.md | 4
-rw-r--r--  gcc/config/rl78/rl78.opt.urls | 2
-rw-r--r--  gcc/config/rs6000/rs6000.cc | 41
-rw-r--r--  gcc/config/rs6000/rs6000.md | 42
-rw-r--r--  gcc/config/rx/rx.cc | 49
-rw-r--r--  gcc/config/s390/s390-protos.h | 2
-rw-r--r--  gcc/config/s390/s390.cc | 198
-rw-r--r--  gcc/config/s390/s390.md | 67
-rw-r--r--  gcc/config/s390/vector.md | 84
-rw-r--r--  gcc/config/xtensa/constraints.md | 2
-rw-r--r--  gcc/config/xtensa/predicates.md | 3
-rw-r--r--  gcc/config/xtensa/xtensa-protos.h | 1
-rw-r--r--  gcc/config/xtensa/xtensa.cc | 134
-rw-r--r--  gcc/config/xtensa/xtensa.md | 429
123 files changed, 7549 insertions, 3659 deletions
diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc
deleted file mode 100644
index cea54de..0000000
--- a/gcc/config/aarch64/aarch64-cc-fusion.cc
+++ /dev/null
@@ -1,297 +0,0 @@
-// Pass to fuse CC operations with other instructions.
-// Copyright (C) 2021-2025 Free Software Foundation, Inc.
-//
-// This file is part of GCC.
-//
-// GCC is free software; you can redistribute it and/or modify it under
-// the terms of the GNU General Public License as published by the Free
-// Software Foundation; either version 3, or (at your option) any later
-// version.
-//
-// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or
-// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
-// for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with GCC; see the file COPYING3. If not see
-// <http://www.gnu.org/licenses/>.
-
-// This pass looks for sequences of the form:
-//
-// A: (set (reg R1) X1)
-// B: ...instructions that might change the value of X1...
-// C: (set (reg CC) X2) // X2 uses R1
-//
-// and tries to change them to:
-//
-// C': [(set (reg CC) X2')
-// (set (reg R1) X1)]
-// B: ...instructions that might change the value of X1...
-//
-// where X2' is the result of replacing R1 with X1 in X2.
-//
-// This sequence occurs in SVE code in two important cases:
-//
-// (a) Sometimes, to deal correctly with overflow, we need to increment
-// an IV after a WHILELO rather than before it. In this case:
-// - A is a WHILELO,
-// - B includes an IV increment and
-// - C is a separate PTEST.
-//
-// (b) ACLE code of the form:
-//
-// svbool_t ok = svrdffr ();
-// if (svptest_last (pg, ok))
-// ...
-//
-// must, for performance reasons, be code-generated as:
-//
-// RDFFRS Pok.B, Pg/Z
-// ...branch on flags result...
-//
-// without a separate PTEST of Pok. In this case:
-// - A is an aarch64_rdffr
-// - B includes an aarch64_update_ffrt
-// - C is a separate PTEST
-//
-// Combine can handle this optimization if B doesn't exist and if A and
-// C are in the same BB. This pass instead handles cases where B does
-// exist and cases where A and C are in different BBs of the same EBB.
-
-#define IN_TARGET_CODE 1
-
-#define INCLUDE_ALGORITHM
-#define INCLUDE_FUNCTIONAL
-#define INCLUDE_ARRAY
-#include "config.h"
-#include "system.h"
-#include "coretypes.h"
-#include "backend.h"
-#include "rtl.h"
-#include "df.h"
-#include "rtl-ssa.h"
-#include "tree-pass.h"
-
-using namespace rtl_ssa;
-
-namespace {
-const pass_data pass_data_cc_fusion =
-{
- RTL_PASS, // type
- "cc_fusion", // name
- OPTGROUP_NONE, // optinfo_flags
- TV_NONE, // tv_id
- 0, // properties_required
- 0, // properties_provided
- 0, // properties_destroyed
- 0, // todo_flags_start
- TODO_df_finish, // todo_flags_finish
-};
-
-// Class that represents one run of the pass.
-class cc_fusion
-{
-public:
- cc_fusion () : m_parallel () {}
- void execute ();
-
-private:
- rtx optimizable_set (const insn_info *);
- bool parallelize_insns (def_info *, rtx, def_info *, rtx);
- void optimize_cc_setter (def_info *, rtx);
-
- // A spare PARALLEL rtx, or null if none.
- rtx m_parallel;
-};
-
-// See whether INSN is a single_set that we can optimize. Return the
-// set if so, otherwise return null.
-rtx
-cc_fusion::optimizable_set (const insn_info *insn)
-{
- if (!insn->can_be_optimized ()
- || insn->is_asm ()
- || insn->has_volatile_refs ()
- || insn->has_pre_post_modify ())
- return NULL_RTX;
-
- return single_set (insn->rtl ());
-}
-
-// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise
-// a single_set that sets (only) OTHER_DEF. CC_SET is known to set the
-// CC register and the instruction that contains CC_SET is known to use
-// OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel.
-bool
-cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set,
- def_info *other_def, rtx other_set)
-{
- auto attempt = crtl->ssa->new_change_attempt ();
-
- insn_info *cc_insn = cc_def->insn ();
- insn_info *other_insn = other_def->insn ();
- if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "trying to parallelize insn %d and insn %d\n",
- other_insn->uid (), cc_insn->uid ());
-
- // Try to substitute OTHER_SET into CC_INSN.
- insn_change_watermark rtl_watermark;
- rtx_insn *cc_rtl = cc_insn->rtl ();
- insn_propagation prop (cc_rtl, SET_DEST (other_set),
- SET_SRC (other_set));
- if (!prop.apply_to_pattern (&PATTERN (cc_rtl))
- || prop.num_replacements == 0)
- {
- if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "-- failed to substitute all uses of r%d\n",
- other_def->regno ());
- return false;
- }
-
- // Restrict the uses to those outside notes.
- use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ());
- use_array other_set_uses = remove_note_accesses (attempt,
- other_insn->uses ());
-
- // Remove the use of the substituted value.
- access_array_builder uses_builder (attempt);
- uses_builder.reserve (cc_uses.size ());
- for (use_info *use : cc_uses)
- if (use->def () != other_def)
- uses_builder.quick_push (use);
- cc_uses = use_array (uses_builder.finish ());
-
- // Get the list of uses for the new instruction.
- insn_change cc_change (cc_insn);
- cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses);
- if (!cc_change.new_uses.is_valid ())
- {
- if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "-- cannot merge uses\n");
- return false;
- }
-
- // The instruction initially defines just two registers. recog can add
- // extra clobbers if necessary.
- auto_vec<access_info *, 2> new_defs;
- new_defs.quick_push (cc_def);
- new_defs.quick_push (other_def);
- sort_accesses (new_defs);
- cc_change.new_defs = def_array (access_array (new_defs));
-
- // Make sure there is somewhere that the new instruction could live.
- auto other_change = insn_change::delete_insn (other_insn);
- insn_change *changes[] = { &other_change, &cc_change };
- cc_change.move_range = cc_insn->ebb ()->insn_range ();
- if (!restrict_movement (cc_change, ignore_changing_insns (changes)))
- {
- if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "-- cannot satisfy all definitions and uses\n");
- return false;
- }
-
- // Tentatively install the new pattern. By convention, the CC set
- // must be first.
- if (m_parallel)
- {
- XVECEXP (m_parallel, 0, 0) = cc_set;
- XVECEXP (m_parallel, 0, 1) = other_set;
- }
- else
- {
- rtvec vec = gen_rtvec (2, cc_set, other_set);
- m_parallel = gen_rtx_PARALLEL (VOIDmode, vec);
- }
- validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1);
-
- // These routines report failures themselves.
- if (!recog (attempt, cc_change, ignore_changing_insns (changes))
- || !changes_are_worthwhile (changes)
- || !crtl->ssa->verify_insn_changes (changes))
- return false;
-
- remove_reg_equal_equiv_notes (cc_rtl);
- confirm_change_group ();
- crtl->ssa->change_insns (changes);
- m_parallel = NULL_RTX;
- return true;
-}
-
-// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes
-// a definition of the CC register by CC_SET.
-void
-cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set)
-{
- // Search the registers used by the CC setter for an easily-substitutable
- // def-use chain.
- for (use_info *other_use : cc_def->insn ()->uses ())
- if (def_info *other_def = other_use->def ())
- if (other_use->regno () != CC_REGNUM
- && other_def->ebb () == cc_def->ebb ())
- if (rtx other_set = optimizable_set (other_def->insn ()))
- {
- rtx dest = SET_DEST (other_set);
- if (REG_P (dest)
- && REGNO (dest) == other_def->regno ()
- && REG_NREGS (dest) == 1
- && parallelize_insns (cc_def, cc_set, other_def, other_set))
- return;
- }
-}
-
-// Run the pass on the current function.
-void
-cc_fusion::execute ()
-{
- // Initialization.
- calculate_dominance_info (CDI_DOMINATORS);
- df_analyze ();
- crtl->ssa = new rtl_ssa::function_info (cfun);
-
- // Walk through all instructions that set CC. Look for a PTEST instruction
- // that we can optimize.
- //
- // ??? The PTEST test isn't needed for correctness, but it ensures that the
- // pass no effect on non-SVE code.
- for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM))
- if (rtx cc_set = optimizable_set (def->insn ()))
- if (REG_P (SET_DEST (cc_set))
- && REGNO (SET_DEST (cc_set)) == CC_REGNUM
- && GET_CODE (SET_SRC (cc_set)) == UNSPEC
- && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST)
- optimize_cc_setter (def, cc_set);
-
- // Finalization.
- crtl->ssa->perform_pending_updates ();
- free_dominance_info (CDI_DOMINATORS);
-}
-
-class pass_cc_fusion : public rtl_opt_pass
-{
-public:
- pass_cc_fusion (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_cc_fusion, ctxt)
- {}
-
- // opt_pass methods:
- virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; }
- virtual unsigned int execute (function *);
-};
-
-unsigned int
-pass_cc_fusion::execute (function *)
-{
- cc_fusion ().execute ();
- return 0;
-}
-
-} // end namespace
-
-// Create a new CC fusion pass instance.
-
-rtl_opt_pass *
-make_pass_cc_fusion (gcc::context *ctxt)
-{
- return new pass_cc_fusion (ctxt);
-}
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 8040409..6f11cc0 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -224,7 +224,7 @@ AARCH64_CORE("neoverse-v3ae", neoversev3ae, cortexa57, V9_2A, (SVE2_BITPERM, RNG
AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
/* NVIDIA ('N') cores. */
-AARCH64_CORE("olympus", olympus, cortexa57, V9_2A, (SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE, FAMINMAX, FP8FMA, FP8DOT2, FP8DOT4, LUT, SVE2_AES, SVE2_SHA3, SVE2_SM4), neoversev3, 0x4e, 0x10, -1)
+AARCH64_CORE("olympus", olympus, cortexa57, V9_2A, (SVE2_BITPERM, RNG, LS64, MEMTAG, PROFILE, FAMINMAX, FP8FMA, FP8DOT2, FP8DOT4, LUT, SVE2_AES, SVE2_SHA3, SVE2_SM4), olympus, 0x4e, 0x10, -1)
/* Armv9-A big.LITTLE processors. */
AARCH64_CORE("gb10", gb10, cortexa57, V9_2A, (SVE2_BITPERM, SVE2_AES, SVE2_SHA3, SVE2_SM4, MEMTAG, PROFILE), cortexx925, 0x41, AARCH64_BIG_LITTLE (0xd85, 0xd87), -1)
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index 1c3e697..db88df0 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -128,7 +128,9 @@ AARCH64_OPT_FMV_EXTENSION("sha2", SHA2, (SIMD), (), (), "sha1 sha2")
AARCH64_FMV_FEATURE("sha3", SHA3, (SHA3))
-AARCH64_OPT_FMV_EXTENSION("aes", AES, (SIMD), (), (), "aes")
+AARCH64_OPT_EXTENSION("aes", AES, (SIMD), (), (), "aes")
+
+AARCH64_FMV_FEATURE("aes", PMULL, (AES))
/* +nocrypto disables AES, SHA2 and SM4, and anything that depends on them
(such as SHA3 and the SVE2 crypto extensions). */
@@ -171,8 +173,6 @@ AARCH64_OPT_FMV_EXTENSION("i8mm", I8MM, (SIMD), (), (), "i8mm")
instructions. */
AARCH64_OPT_FMV_EXTENSION("bf16", BF16, (FP), (SIMD), (), "bf16")
-AARCH64_FMV_FEATURE("rpres", RPRES, ())
-
AARCH64_OPT_FMV_EXTENSION("sve", SVE, (SIMD, F16, FCMA), (), (), "sve")
/* This specifically does not imply +sve. */
@@ -190,7 +190,7 @@ AARCH64_OPT_FMV_EXTENSION("sve2", SVE2, (SVE), (), (), "sve2")
AARCH64_OPT_EXTENSION("sve2-aes", SVE2_AES, (SVE2, AES), (), (), "sveaes")
-AARCH64_FMV_FEATURE("sve2-aes", SVE_AES, (SVE2_AES))
+AARCH64_FMV_FEATURE("sve2-aes", SVE_PMULL128, (SVE2_AES))
AARCH64_OPT_EXTENSION("sve2-bitperm", SVE2_BITPERM, (SVE2), (), (),
"svebitperm")
@@ -245,9 +245,9 @@ AARCH64_OPT_EXTENSION("sme-b16b16", SME_B16B16, (SME2, SVE_B16B16), (), (), "sme
AARCH64_OPT_EXTENSION("sme-f16f16", SME_F16F16, (SME2), (), (), "smef16f16")
-AARCH64_OPT_EXTENSION("mops", MOPS, (), (), (), "mops")
+AARCH64_OPT_FMV_EXTENSION("mops", MOPS, (), (), (), "mops")
-AARCH64_OPT_EXTENSION("cssc", CSSC, (), (), (), "cssc")
+AARCH64_OPT_FMV_EXTENSION("cssc", CSSC, (), (), (), "cssc")
AARCH64_OPT_EXTENSION("cmpbr", CMPBR, (), (), (), "cmpbr")
diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def
index 9cf9d3e..6a53ff3 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -24,6 +24,5 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_switch_pstate_sm);
INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_late_track_speculation);
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
-INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
INSERT_PASS_BEFORE (pass_early_remat, 1, pass_ldp_fusion);
INSERT_PASS_BEFORE (pass_peephole2, 1, pass_ldp_fusion);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e946e8d..56efcf2 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1020,7 +1020,7 @@ void aarch64_err_no_fpadvsimd (machine_mode);
void aarch64_expand_epilogue (rtx_call_insn *);
rtx aarch64_ptrue_all (unsigned int);
opt_machine_mode aarch64_ptrue_all_mode (rtx);
-rtx aarch64_convert_sve_data_to_pred (rtx, machine_mode, rtx);
+rtx aarch64_convert_sve_data_to_pred (rtx, rtx);
rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx);
void aarch64_expand_mov_immediate (rtx, rtx);
rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
@@ -1031,6 +1031,7 @@ rtx aarch64_pfalse_reg (machine_mode);
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
rtx aarch64_sve_packed_pred (machine_mode);
rtx aarch64_sve_fp_pred (machine_mode, rtx *);
+rtx aarch64_sve_emit_masked_fp_pred (machine_mode, rtx);
void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
bool aarch64_expand_maskloadstore (rtx *, machine_mode);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
@@ -1038,6 +1039,7 @@ void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode);
bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx);
rtx aarch64_replace_reg_mode (rtx, machine_mode);
void aarch64_split_sve_subreg_move (rtx, rtx, rtx);
+void aarch64_emit_sve_pred_vec_duplicate (machine_mode, rtx, rtx);
void aarch64_expand_prologue (void);
void aarch64_decompose_vec_struct_index (machine_mode, rtx *, rtx *, bool);
void aarch64_expand_vector_init (rtx, rtx);
@@ -1096,6 +1098,7 @@ bool aarch64_legitimate_address_p (machine_mode, rtx, bool,
aarch64_addr_query_type = ADDR_QUERY_M);
machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);
rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx);
+rtx aarch64_gen_compare_split_imm24 (rtx, rtx, rtx);
bool aarch64_maxmin_plus_const (rtx_code, rtx *, bool);
rtx aarch64_load_tp (rtx);
@@ -1234,7 +1237,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *);
rtl_opt_pass *make_pass_track_speculation (gcc::context *);
rtl_opt_pass *make_pass_late_track_speculation (gcc::context *);
rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
-rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
rtl_opt_pass *make_pass_switch_pstate_sm (gcc::context *ctxt);
rtl_opt_pass *make_pass_ldp_fusion (gcc::context *);
@@ -1279,4 +1281,7 @@ extern bool aarch64_gcs_enabled ();
extern unsigned aarch64_data_alignment (const_tree exp, unsigned align);
extern unsigned aarch64_stack_alignment (const_tree exp, unsigned align);
+extern rtx aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
+ rtx_code_label *label);
+
#endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
index 6b3f439..0123ea0 100644
--- a/gcc/config/aarch64/aarch64-sme.md
+++ b/gcc/config/aarch64/aarch64-sme.md
@@ -62,6 +62,10 @@
;; (b) they are sometimes used conditionally, particularly in streaming-
;; compatible code.
;;
+;; To prevent the latter from upsetting the assembler, we emit the literal
+;; encodings of "SMSTART SM" and "SMSTOP SM" when compiling without
+;; TARGET_SME.
+;;
;; =========================================================================
;; -------------------------------------------------------------------------
@@ -161,7 +165,9 @@
(clobber (reg:VNx16BI P14_REGNUM))
(clobber (reg:VNx16BI P15_REGNUM))]
""
- "smstart\tsm"
+ {
+ return TARGET_SME ? "smstart\tsm" : ".inst 0xd503437f // smstart sm";
+ }
)
;; Turn off streaming mode. This clobbers all SVE state.
@@ -196,7 +202,9 @@
(clobber (reg:VNx16BI P14_REGNUM))
(clobber (reg:VNx16BI P15_REGNUM))]
""
- "smstop\tsm"
+ {
+ return TARGET_SME ? "smstop\tsm" : ".inst 0xd503427f // smstop sm";
+ }
)
;; -------------------------------------------------------------------------
@@ -392,7 +400,8 @@
auto label = gen_label_rtx ();
auto tpidr2 = gen_rtx_REG (DImode, R16_REGNUM);
emit_insn (gen_aarch64_read_tpidr2 (tpidr2));
- auto jump = emit_likely_jump_insn (gen_aarch64_cbznedi1 (tpidr2, label));
+ auto pat = aarch64_gen_compare_zero_and_branch (NE, tpidr2, label);
+ auto jump = emit_likely_jump_insn (pat);
JUMP_LABEL (jump) = label;
aarch64_restore_za (operands[0]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index b439683..ecc0687 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -214,7 +214,8 @@ public:
expand (function_expander &e) const override
{
e.add_ptrue_hint (0, e.gp_mode (0));
- insn_code icode = code_for_aarch64_pred_fac (m_unspec, e.vector_mode (0));
+ insn_code icode = code_for_aarch64_pred_fac_acle (m_unspec,
+ e.vector_mode (0));
return e.use_exact_insn (icode);
}
@@ -497,10 +498,10 @@ public:
{
bool unsigned_p = e.type_suffix (0).unsigned_p;
rtx_code code = get_rtx_code (m_code, unsigned_p);
- return e.use_exact_insn (code_for_aarch64_pred_cmp (code, mode));
+ return e.use_exact_insn (code_for_aarch64_pred_cmp_acle (code, mode));
}
- insn_code icode = code_for_aarch64_pred_fcm (m_unspec_for_fp, mode);
+ insn_code icode = code_for_aarch64_pred_fcm_acle (m_unspec_for_fp, mode);
return e.use_exact_insn (icode);
}
@@ -542,7 +543,7 @@ public:
/* If the argument is a constant that the unwidened comparisons
can handle directly, use them instead. */
- insn_code icode = code_for_aarch64_pred_cmp (code, mode);
+ insn_code icode = code_for_aarch64_pred_cmp_acle (code, mode);
rtx op2 = unwrap_const_vec_duplicate (e.args[3]);
if (CONSTANT_P (op2)
&& insn_data[icode].operand[4].predicate (op2, DImode))
@@ -581,7 +582,8 @@ public:
expand (function_expander &e) const override
{
e.add_ptrue_hint (0, e.gp_mode (0));
- return e.use_exact_insn (code_for_aarch64_pred_fcmuo (e.vector_mode (0)));
+ auto mode = e.vector_mode (0);
+ return e.use_exact_insn (code_for_aarch64_pred_fcmuo_acle (mode));
}
};
@@ -1048,6 +1050,23 @@ public:
rtx
expand (function_expander &e) const override
{
+ machine_mode mode = e.vector_mode (0);
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
+ {
+ gcc_assert (e.pred == PRED_none);
+
+ rtx src = e.args[0];
+ if (GET_CODE (src) == CONST_INT)
+ return (src == const0_rtx
+ ? CONST0_RTX (VNx16BImode)
+ : aarch64_ptrue_all (e.type_suffix (0).element_bytes));
+
+ rtx dest = e.get_reg_target ();
+ src = force_reg (GET_MODE (src), src);
+ aarch64_emit_sve_pred_vec_duplicate (mode, dest, src);
+ return dest;
+ }
+
if (e.pred == PRED_none || e.pred == PRED_x)
/* There's no benefit to using predicated instructions for _x here. */
return e.use_unpred_insn (e.direct_optab_handler (vec_duplicate_optab));
@@ -1056,7 +1075,6 @@ public:
the duplicate of the function argument and the "false" value
is the value of inactive lanes. */
insn_code icode;
- machine_mode mode = e.vector_mode (0);
if (valid_for_const_vector_p (GET_MODE_INNER (mode), e.args.last ()))
/* Duplicate the constant to fill a vector. The pattern optimizes
various cases involving constant operands, falling back to SEL
@@ -1197,8 +1215,7 @@ public:
if (mode != e.vector_mode (0))
{
rtx data_dupq = aarch64_expand_sve_dupq (NULL, mode, vq_reg);
- return aarch64_convert_sve_data_to_pred (e.possible_target,
- e.vector_mode (0), data_dupq);
+ return aarch64_convert_sve_data_to_pred (e.possible_target, data_dupq);
}
return aarch64_expand_sve_dupq (e.possible_target, mode, vq_reg);
@@ -1259,9 +1276,10 @@ public:
index = target;
}
- e.args[0] = gen_lowpart (VNx2DImode, e.args[0]);
+ e.args[0] = aarch64_sve_reinterpret (VNx2DImode, e.args[0]);
e.args[1] = index;
- return e.use_exact_insn (CODE_FOR_aarch64_sve_tblvnx2di);
+ rtx res = e.use_exact_insn (CODE_FOR_aarch64_sve_tblvnx2di);
+ return aarch64_sve_reinterpret (mode, res);
}
};
@@ -2857,7 +2875,10 @@ public:
rtx
expand (function_expander &e) const override
{
- return e.use_exact_insn (code_for_aarch64_sve_rev (e.vector_mode (0)));
+ auto mode = e.vector_mode (0);
+ return e.use_exact_insn (e.type_suffix (0).bool_p
+ ? code_for_aarch64_sve_rev_acle (mode)
+ : code_for_aarch64_sve_rev (mode));
}
};
@@ -3248,7 +3269,7 @@ public:
unsigned int unpacks = m_high_p ? UNSPEC_UNPACKSHI : UNSPEC_UNPACKSLO;
insn_code icode;
if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
- icode = code_for_aarch64_sve_punpk (unpacku, mode);
+ icode = code_for_aarch64_sve_punpk_acle (unpacku);
else
{
int unspec = e.type_suffix (0).unsigned_p ? unpacku : unpacks;
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-functions.h b/gcc/config/aarch64/aarch64-sve-builtins-functions.h
index 6f1c694..c05946d 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-functions.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins-functions.h
@@ -630,7 +630,10 @@ public:
rtx
expand (function_expander &e) const override
{
- insn_code icode = code_for_aarch64_sve (m_unspec, e.vector_mode (0));
+ auto mode = e.vector_mode (0);
+ insn_code icode = (e.type_suffix (0).bool_p
+ ? code_for_aarch64_sve_acle (m_unspec, mode)
+ : code_for_aarch64_sve (m_unspec, mode));
return e.use_exact_insn (icode);
}
@@ -838,7 +841,8 @@ public:
machine_mode pred_mode = e.vector_mode (0);
scalar_mode reg_mode = GET_MODE_INNER (e.vector_mode (1));
- return e.use_exact_insn (code_for_while (unspec, reg_mode, pred_mode));
+ auto icode = code_for_aarch64_sve_while_acle (unspec, reg_mode, pred_mode);
+ return e.use_exact_insn (icode);
}
/* The unspec codes associated with signed and unsigned operations
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sme.def b/gcc/config/aarch64/aarch64-sve-builtins-sme.def
index 8e6aadc..117b70e 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sme.def
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sme.def
@@ -92,7 +92,8 @@ DEF_SME_FUNCTION (svstr_zt, str_zt, none, none)
DEF_SME_FUNCTION (svzero_zt, inherent_zt, none, none)
#undef REQUIRED_EXTENSIONS
-#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 && AARCH64_FL_FAMINMAX)
+#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 \
+ | AARCH64_FL_FAMINMAX)
DEF_SME_FUNCTION_GS (svamin, binary_opt_single_n, all_float, x24, none)
DEF_SME_FUNCTION_GS (svamax, binary_opt_single_n, all_float, x24, none)
#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
index 73004a8..95c5ed8 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
@@ -881,7 +881,9 @@ public:
{
for (unsigned int i = 0; i < 2; ++i)
e.args[i] = e.convert_to_pmode (e.args[i]);
- return e.use_exact_insn (code_for_while (m_unspec, Pmode, e.gp_mode (0)));
+ auto icode = code_for_aarch64_sve_while_acle (m_unspec, Pmode,
+ e.gp_mode (0));
+ return e.use_exact_insn (icode);
}
int m_unspec;
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 2b627a9..1764cf8 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -4004,7 +4004,8 @@ rtx
function_expander::get_reg_target ()
{
machine_mode target_mode = result_mode ();
- if (!possible_target || GET_MODE (possible_target) != target_mode)
+ if (!possible_target
+ || !register_operand (possible_target, target_mode))
possible_target = gen_reg_rtx (target_mode);
return possible_target;
}
@@ -4589,10 +4590,31 @@ function_expander::expand ()
{
/* The last element of these functions is always an fpm_t that must be
written to FPMR before the call to the instruction itself. */
- gcc_assert (args.last ()->mode == DImode);
- emit_move_insn (gen_rtx_REG (DImode, FPM_REGNUM), args.last ());
+ rtx fpm = args.last ();
+ gcc_assert (CONST_INT_P (fpm) || GET_MODE (fpm) == DImode);
+ emit_move_insn (gen_rtx_REG (DImode, FPM_REGNUM), fpm);
}
- return base->expand (*this);
+ rtx result = base->expand (*this);
+ if (function_returns_void_p ())
+ gcc_assert (result == const0_rtx);
+ else
+ {
+ auto expected_mode = result_mode ();
+ if (GET_MODE_CLASS (expected_mode) == MODE_INT)
+ /* Scalar integer constants don't store a mode.
+
+ It's OK for a variable result to have a different mode from the
+ function return type. In particular, some functions that return int
+ expand into instructions that have a DImode result, with all 64 bits
+ of the DImode being well-defined (usually zero). */
+ gcc_assert (CONST_SCALAR_INT_P (result)
+ || GET_MODE_CLASS (GET_MODE (result)) == MODE_INT);
+ else
+ /* In other cases, the return value should have the same mode
+ as the return type. */
+ gcc_assert (GET_MODE (result) == expected_mode);
+ }
+ return result;
}
/* Return a structure type that contains a single field of type FIELD_TYPE.
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index b252eef..51e2d7d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1542,18 +1542,18 @@
UNSPEC_LD1_GATHER))]
"TARGET_SVE && TARGET_NON_STREAMING"
{@ [cons: =0, 1, 2, 3, 4, 5 ]
- [&w, Z, w, Ui1, Ui1, Upl] ld1<Vesize>\t%0.s, %5/z, [%2.s]
- [?w, Z, 0, Ui1, Ui1, Upl] ^
- [&w, vgw, w, Ui1, Ui1, Upl] ld1<Vesize>\t%0.s, %5/z, [%2.s, #%1]
- [?w, vgw, 0, Ui1, Ui1, Upl] ^
- [&w, rk, w, Z, Ui1, Upl] ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, sxtw]
- [?w, rk, 0, Z, Ui1, Upl] ^
- [&w, rk, w, Ui1, Ui1, Upl] ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, uxtw]
- [?w, rk, 0, Ui1, Ui1, Upl] ^
- [&w, rk, w, Z, i, Upl] ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
- [?w, rk, 0, Z, i, Upl] ^
- [&w, rk, w, Ui1, i, Upl] ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, uxtw %p4]
- [?w, rk, 0, Ui1, i, Upl] ^
+ [&w, Z, w, Ui1, Ui1, Upl] ld1<Vesize>\t%0.s, %5/z, [%2.s]
+ [?w, Z, 0, Ui1, Ui1, Upl] ^
+ [&w, vg<Vesize>, w, Ui1, Ui1, Upl] ld1<Vesize>\t%0.s, %5/z, [%2.s, #%1]
+ [?w, vg<Vesize>, 0, Ui1, Ui1, Upl] ^
+ [&w, rk, w, Z, Ui1, Upl] ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, sxtw]
+ [?w, rk, 0, Z, Ui1, Upl] ^
+ [&w, rk, w, Ui1, Ui1, Upl] ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, uxtw]
+ [?w, rk, 0, Ui1, Ui1, Upl] ^
+ [&w, rk, w, Z, i, Upl] ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
+ [?w, rk, 0, Z, i, Upl] ^
+ [&w, rk, w, Ui1, i, Upl] ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, uxtw %p4]
+ [?w, rk, 0, Ui1, i, Upl] ^
}
)
@@ -1572,14 +1572,14 @@
UNSPEC_LD1_GATHER))]
"TARGET_SVE && TARGET_NON_STREAMING"
{@ [cons: =0, 1, 2, 3, 4, 5]
- [&w, Z, w, i, Ui1, Upl] ld1<Vesize>\t%0.d, %5/z, [%2.d]
- [?w, Z, 0, i, Ui1, Upl] ^
- [&w, vgd, w, i, Ui1, Upl] ld1<Vesize>\t%0.d, %5/z, [%2.d, #%1]
- [?w, vgd, 0, i, Ui1, Upl] ^
- [&w, rk, w, i, Ui1, Upl] ld1<Vesize>\t%0.d, %5/z, [%1, %2.d]
- [?w, rk, 0, i, Ui1, Upl] ^
- [&w, rk, w, i, i, Upl] ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, lsl %p4]
- [?w, rk, 0, i, i, Upl] ^
+ [&w, Z, w, i, Ui1, Upl] ld1<Vesize>\t%0.d, %5/z, [%2.d]
+ [?w, Z, 0, i, Ui1, Upl] ^
+ [&w, vg<Vesize>, w, i, Ui1, Upl] ld1<Vesize>\t%0.d, %5/z, [%2.d, #%1]
+ [?w, vg<Vesize>, 0, i, Ui1, Upl] ^
+ [&w, rk, w, i, Ui1, Upl] ld1<Vesize>\t%0.d, %5/z, [%1, %2.d]
+ [?w, rk, 0, i, Ui1, Upl] ^
+ [&w, rk, w, i, i, Upl] ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, lsl %p4]
+ [?w, rk, 0, i, i, Upl] ^
}
)
@@ -2488,13 +2488,13 @@
(match_operand:SVE_4 4 "register_operand")]
UNSPEC_ST1_SCATTER))]
"TARGET_SVE && TARGET_NON_STREAMING"
- {@ [ cons: 0 , 1 , 2 , 3 , 4 , 5 ]
- [ Z , w , Ui1 , Ui1 , w , Upl ] st1<Vesize>\t%4.s, %5, [%1.s]
- [ vgw , w , Ui1 , Ui1 , w , Upl ] st1<Vesize>\t%4.s, %5, [%1.s, #%0]
- [ rk , w , Z , Ui1 , w , Upl ] st1<Vesize>\t%4.s, %5, [%0, %1.s, sxtw]
- [ rk , w , Ui1 , Ui1 , w , Upl ] st1<Vesize>\t%4.s, %5, [%0, %1.s, uxtw]
- [ rk , w , Z , i , w , Upl ] st1<Vesize>\t%4.s, %5, [%0, %1.s, sxtw %p3]
- [ rk , w , Ui1 , i , w , Upl ] st1<Vesize>\t%4.s, %5, [%0, %1.s, uxtw %p3]
+ {@ [ cons: 0 , 1 , 2 , 3 , 4 , 5 ]
+ [ Z , w , Ui1 , Ui1 , w , Upl ] st1<Vesize>\t%4.s, %5, [%1.s]
+ [ vg<Vesize> , w , Ui1 , Ui1 , w , Upl ] st1<Vesize>\t%4.s, %5, [%1.s, #%0]
+ [ rk , w , Z , Ui1 , w , Upl ] st1<Vesize>\t%4.s, %5, [%0, %1.s, sxtw]
+ [ rk , w , Ui1 , Ui1 , w , Upl ] st1<Vesize>\t%4.s, %5, [%0, %1.s, uxtw]
+ [ rk , w , Z , i , w , Upl ] st1<Vesize>\t%4.s, %5, [%0, %1.s, sxtw %p3]
+ [ rk , w , Ui1 , i , w , Upl ] st1<Vesize>\t%4.s, %5, [%0, %1.s, uxtw %p3]
}
)
@@ -2511,11 +2511,11 @@
(match_operand:SVE_2 4 "register_operand")]
UNSPEC_ST1_SCATTER))]
"TARGET_SVE && TARGET_NON_STREAMING"
- {@ [ cons: 0 , 1 , 3 , 4 , 5 ]
- [ Z , w , Ui1 , w , Upl ] st1<Vesize>\t%4.d, %5, [%1.d]
- [ vgd , w , Ui1 , w , Upl ] st1<Vesize>\t%4.d, %5, [%1.d, #%0]
- [ rk , w , Ui1 , w , Upl ] st1<Vesize>\t%4.d, %5, [%0, %1.d]
- [ rk , w , i , w , Upl ] st1<Vesize>\t%4.d, %5, [%0, %1.d, lsl %p3]
+ {@ [ cons: 0 , 1 , 3 , 4 , 5 ]
+ [ Z , w , Ui1 , w , Upl ] st1<Vesize>\t%4.d, %5, [%1.d]
+ [ vg<Vesize> , w , Ui1 , w , Upl ] st1<Vesize>\t%4.d, %5, [%1.d, #%0]
+ [ rk , w , Ui1 , w , Upl ] st1<Vesize>\t%4.d, %5, [%0, %1.d]
+ [ rk , w , i , w , Upl ] st1<Vesize>\t%4.d, %5, [%0, %1.d, lsl %p3]
}
)
@@ -2990,10 +2990,7 @@
(vec_duplicate:PRED_ALL (match_operand:QI 1 "register_operand")))]
"TARGET_SVE"
{
- rtx tmp = gen_reg_rtx (DImode);
- rtx op1 = gen_lowpart (DImode, operands[1]);
- emit_insn (gen_ashldi3 (tmp, op1, gen_int_mode (63, DImode)));
- emit_insn (gen_while_ultdi<mode> (operands[0], const0_rtx, tmp));
+ aarch64_emit_sve_pred_vec_duplicate (<MODE>mode, operands[0], operands[1]);
DONE;
}
)
@@ -5605,18 +5602,21 @@
;; Predicated floating-point operations with merging.
(define_expand "@cond_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
- (match_operand:SVE_FULL_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
+ (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
+ (match_operand:SVE_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
+ {
+ operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]);
+ }
)
;; Predicated floating-point operations, merging with the first input.
@@ -5644,14 +5644,14 @@
)
(define_insn "*cond_<optab><mode>_2_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5687,14 +5687,14 @@
)
(define_insn "*cond_<optab><mode>_2_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
SVE_COND_FP_BINARY_I1)
(match_dup 2)]
UNSPEC_SEL))]
@@ -5730,14 +5730,14 @@
)
(define_insn "*cond_<optab><mode>_3_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 3)]
UNSPEC_SEL))]
@@ -5794,16 +5794,16 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -5868,16 +5868,16 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
SVE_COND_FP_BINARY_I1)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
{@ [ cons: =0 , 1 , 2 , 4 ]
@@ -5953,14 +5953,14 @@
)
(define_insn "*cond_add<mode>_2_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
UNSPEC_COND_FADD)
(match_dup 2)]
UNSPEC_SEL))]
@@ -6015,16 +6015,16 @@
)
(define_insn_and_rewrite "*cond_add<mode>_any_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")]
UNSPEC_COND_FADD)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
{@ [ cons: =0 , 1 , 2 , 3 , 4 ]
@@ -6266,14 +6266,14 @@
)
(define_insn "*cond_sub<mode>_3_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
+ (match_operand:SVE_F 3 "register_operand")]
UNSPEC_COND_FSUB)
(match_dup 3)]
UNSPEC_SEL))]
@@ -6323,16 +6323,16 @@
)
(define_insn_and_rewrite "*cond_sub<mode>_const_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
+ (match_operand:SVE_F 3 "register_operand")]
UNSPEC_COND_FSUB)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && !rtx_equal_p (operands[3], operands[4])"
{@ [ cons: =0 , 1 , 3 , 4 ]
@@ -6913,7 +6913,7 @@
;; Predicate AND. We can reuse one of the inputs as the GP.
;; Doubling the second operand is the preferred implementation
;; of the MOV alias, so we use that instead of %1/z, %1, %2.
-(define_insn "and<mode>3"
+(define_insn "@and<mode>3"
[(set (match_operand:PRED_ALL 0 "register_operand")
(and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")
(match_operand:PRED_ALL 2 "register_operand")))]
@@ -7595,29 +7595,29 @@
;; Unpredicated floating-point ternary operations.
(define_expand "<optab><mode>4"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 4)
- (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 1 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_dup 5)
+ (match_operand:SVE_F_B16B16 1 "register_operand")
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_TERNARY))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
- operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[4] = aarch64_sve_fp_pred (<MODE>mode, &operands[5]);
}
)
;; Predicated floating-point ternary operations.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 5 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx , is_rev ]
@@ -7631,17 +7631,17 @@
;; Predicated floating-point ternary operations with merging.
(define_expand "@cond_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
@@ -7649,20 +7649,22 @@
second of the two. */
if (rtx_equal_p (operands[3], operands[5]))
std::swap (operands[2], operands[3]);
+
+ operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]);
})
;; Predicated floating-point ternary operations, merging with the
;; first input.
(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_F
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -7678,15 +7680,15 @@
)
(define_insn "*cond_<optab><mode>_2_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 2)]
UNSPEC_SEL))]
@@ -7700,15 +7702,15 @@
;; Predicated floating-point ternary operations, merging with the
;; third input.
(define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 4)]
UNSPEC_SEL))]
@@ -7724,15 +7726,15 @@
)
(define_insn "*cond_<optab><mode>_4_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 4)]
UNSPEC_SEL))]
@@ -7746,17 +7748,17 @@
;; Predicated floating-point ternary operations, merging with an
;; independent value.
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ (unspec:SVE_F_B16B16
[(match_operand 6)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -7792,17 +7794,17 @@
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
&& (<supports_bf16> || !<is_bf16>)
@@ -8201,20 +8203,23 @@
;;
;; For unpacked vectors, it doesn't really matter whether SEL uses the
;; the container size or the element size. If SEL used the container size,
-;; it would ignore undefined bits of the predicate but would copy the
-;; upper (undefined) bits of each container along with the defined bits.
-;; If SEL used the element size, it would use undefined bits of the predicate
-;; to select between undefined elements in each input vector. Thus the only
-;; difference is whether the undefined bits in a container always come from
-;; the same input as the defined bits, or whether the choice can vary
-;; independently of the defined bits.
+;; it would would copy the upper (undefined) bits of each container along
+;; with the corresponding defined bits. If SEL used the element size,
+;; it would use separate predicate bits to select between the undefined
+;; elements in each input vector; these seperate predicate bits might
+;; themselves be undefined, depending on the mode of the predicate.
+;;
+;; Thus the only difference is whether the undefined bits in a container
+;; always come from the same input as the defined bits, or whether the
+;; choice can vary independently of the defined bits.
;;
;; For the other instructions, using the element size is more natural,
;; so we do that for SEL as well.
+;;
(define_insn "*vcond_mask_<mode><vpred>"
[(set (match_operand:SVE_ALL 0 "register_operand")
(unspec:SVE_ALL
- [(match_operand:<VPRED> 3 "register_operand")
+ [(match_operand:<VPRED> 3 "aarch64_predicate_operand")
(match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm")
(match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
@@ -8353,6 +8358,71 @@
}
)
+;; Likewise, but yield a VNx16BI result regardless of the element width.
+;; The .b case is equivalent to the above.
+(define_expand "@aarch64_pred_cmp<cmp_op><mode>_acle"
+ [(parallel
+ [(set (match_operand:<VPRED> 0 "register_operand")
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (SVE_INT_CMP:<VPRED>
+ (match_operand:VNx16QI_ONLY 3 "register_operand")
+ (match_operand:VNx16QI_ONLY 4 "aarch64_sve_cmp_<sve_imm_con>_operand"))]
+ UNSPEC_PRED_Z))
+ (clobber (reg:CC_NZC CC_REGNUM))])]
+ "TARGET_SVE"
+)
+
+;; For wider elements, bitcast the predicate result to a VNx16BI and use
+;; an (and ...) to indicate that only every second, fourth, or eighth bit
+;; is set.
+(define_expand "@aarch64_pred_cmp<cmp_op><mode>_acle"
+ [(parallel
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (SVE_INT_CMP:<VPRED>
+ (match_operand:SVE_FULL_HSDI 3 "register_operand")
+ (match_operand:SVE_FULL_HSDI 4 "aarch64_sve_cmp_<sve_imm_con>_operand"))]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_dup 5)))
+ (clobber (reg:CC_NZC CC_REGNUM))])]
+ "TARGET_SVE"
+ {
+ operands[5] = aarch64_ptrue_all (GET_MODE_UNIT_SIZE (<MODE>mode));
+ }
+)
+
+(define_insn "*aarch64_pred_cmp<cmp_op><mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (SVE_INT_CMP:<VPRED>
+ (match_operand:SVE_FULL_HSDI 3 "register_operand")
+ (match_operand:SVE_FULL_HSDI 4 "aarch64_sve_cmp_<sve_imm_con>_operand"))]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_operand:<VPRED> 5 "aarch64_ptrue_all_operand")))
+ (clobber (reg:CC_NZC CC_REGNUM))]
+ "TARGET_SVE"
+ {@ [ cons: =0 , 1 , 3 , 4 ; attrs: pred_clobber ]
+ [ &Upa , Upl, w , <sve_imm_con>; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, #%4
+ [ ?Upl , 0 , w , <sve_imm_con>; yes ] ^
+ [ Upa , Upl, w , <sve_imm_con>; no ] ^
+ [ &Upa , Upl, w , w ; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>
+ [ ?Upl , 0 , w , w ; yes ] ^
+ [ Upa , Upl, w , w ; no ] ^
+ }
+)
+
;; Predicated integer comparisons in which both the flag and predicate
;; results are interesting.
(define_insn_and_rewrite "*cmp<cmp_op><mode>_cc"
@@ -8394,6 +8464,49 @@
}
)
+(define_insn_and_rewrite "*cmp<cmp_op><mode>_acle_cc"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand 6)
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (SVE_INT_CMP:<VPRED>
+ (match_operand:SVE_FULL_HSDI 2 "register_operand")
+ (match_operand:SVE_FULL_HSDI 3 "aarch64_sve_cmp_<sve_imm_con>_operand"))]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_dup 6)
+ (match_dup 7)
+ (SVE_INT_CMP:<VPRED>
+ (match_dup 2)
+ (match_dup 3))]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_operand:<VPRED> 8 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: pred_clobber ]
+ [ &Upa , Upl, w , <sve_imm_con>; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, #%3
+ [ ?Upl , 0 , w , <sve_imm_con>; yes ] ^
+ [ Upa , Upl, w , <sve_imm_con>; no ] ^
+ [ &Upa , Upl, w , w ; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>
+ [ ?Upl , 0 , w , w ; yes ] ^
+ [ Upa , Upl, w , w ; no ] ^
+ }
+ "&& !rtx_equal_p (operands[4], operands[6])"
+ {
+ operands[6] = copy_rtx (operands[4]);
+ operands[7] = operands[5];
+ }
+)
+
;; Predicated integer comparisons in which only the flags result is
;; interesting.
(define_insn_and_rewrite "*cmp<cmp_op><mode>_ptest"
@@ -8459,14 +8572,52 @@
(clobber (reg:CC_NZC CC_REGNUM))])]
)
+(define_insn_and_split "*cmp<cmp_op><mode>_acle_and"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa, Upa")
+ (and:VNx16BI
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand 4)
+ (const_int SVE_KNOWN_PTRUE)
+ (SVE_INT_CMP:<VPRED>
+ (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w")
+ (match_operand:SVE_FULL_HSDI 3 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_operand:VNx16BI 1 "register_operand" "Upl, Upl"))
+ (match_operand:<VPRED> 5 "aarch64_ptrue_all_operand")))
+ (clobber (reg:CC_NZC CC_REGNUM))]
+ "TARGET_SVE"
+ "#"
+ "&& 1"
+ [(parallel
+ [(set (match_dup 0)
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_dup 1)
+ (const_int SVE_MAYBE_NOT_PTRUE)
+ (SVE_INT_CMP:<VPRED>
+ (match_dup 2)
+ (match_dup 3))]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_dup 5)))
+ (clobber (reg:CC_NZC CC_REGNUM))])]
+ {
+ operands[1] = gen_lowpart (<VPRED>mode, operands[1]);
+ }
+)
+
;; Predicated integer wide comparisons.
(define_insn "@aarch64_pred_cmp<cmp_op><mode>_wide"
[(set (match_operand:<VPRED> 0 "register_operand")
(unspec:<VPRED>
- [(match_operand:VNx16BI 1 "register_operand")
+ [(match_operand:<VPRED> 1 "register_operand")
(match_operand:SI 2 "aarch64_sve_ptrue_flag")
(unspec:<VPRED>
- [(match_operand:SVE_FULL_BHSI 3 "register_operand")
+ [(match_operand:VNx16QI_ONLY 3 "register_operand")
(match_operand:VNx2DI 4 "register_operand")]
SVE_COND_INT_CMP_WIDE)]
UNSPEC_PRED_Z))
@@ -8479,16 +8630,61 @@
}
)
+(define_expand "@aarch64_pred_cmp<cmp_op><mode>_wide"
+ [(parallel
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand:SVE_FULL_HSI 3 "register_operand")
+ (match_operand:VNx2DI 4 "register_operand")]
+ SVE_COND_INT_CMP_WIDE)]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_dup 5)))
+ (clobber (reg:CC_NZC CC_REGNUM))])]
+ "TARGET_SVE"
+ {
+ operands[5] = aarch64_ptrue_all (GET_MODE_UNIT_SIZE (<MODE>mode));
+ }
+)
+
+(define_insn "*aarch64_pred_cmp<cmp_op><mode>_wide"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand:SVE_FULL_HSI 3 "register_operand")
+ (match_operand:VNx2DI 4 "register_operand")]
+ SVE_COND_INT_CMP_WIDE)]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_operand:<VPRED> 5 "aarch64_ptrue_all_operand")))
+ (clobber (reg:CC_NZC CC_REGNUM))]
+ "TARGET_SVE"
+ {@ [ cons: =0, 1 , 2, 3, 4; attrs: pred_clobber ]
+ [ &Upa , Upl, , w, w; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.d
+ [ ?Upl , 0 , , w, w; yes ] ^
+ [ Upa , Upl, , w, w; no ] ^
+ }
+)
+
;; Predicated integer wide comparisons in which both the flag and
;; predicate results are interesting.
-(define_insn "*aarch64_pred_cmp<cmp_op><mode>_wide_cc"
+(define_insn_and_rewrite "*aarch64_pred_cmp<cmp_op><mode>_wide_cc"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand:VNx16BI 1 "register_operand")
(match_operand 4)
(match_operand:SI 5 "aarch64_sve_ptrue_flag")
(unspec:<VPRED>
- [(match_operand:VNx16BI 6 "register_operand")
+ [(match_operand:<VPRED> 6 "register_operand")
(match_operand:SI 7 "aarch64_sve_ptrue_flag")
(unspec:<VPRED>
[(match_operand:SVE_FULL_BHSI 2 "register_operand")
@@ -8512,18 +8708,65 @@
[ ?Upl , 0 , w, w, Upl; yes ] ^
[ Upa , Upl, w, w, Upl; no ] ^
}
+ "&& !rtx_equal_p (operands[4], operands[6])"
+ {
+ operands[6] = copy_rtx (operands[4]);
+ operands[7] = operands[5];
+ }
+)
+
+(define_insn_and_rewrite "*aarch64_pred_cmp<cmp_op><mode>_wide_cc"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 6 "register_operand")
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand:SVE_FULL_HSI 2 "register_operand")
+ (match_operand:VNx2DI 3 "register_operand")]
+ SVE_COND_INT_CMP_WIDE)]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_dup 6)
+ (match_dup 7)
+ (unspec:<VPRED>
+ [(match_dup 2)
+ (match_dup 3)]
+ SVE_COND_INT_CMP_WIDE)]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_operand:<VPRED> 8 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+ {@ [ cons: =0, 1 , 2, 3, 6 ; attrs: pred_clobber ]
+ [ &Upa , Upl, w, w, Upl; yes ] cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d
+ [ ?Upl , 0 , w, w, Upl; yes ] ^
+ [ Upa , Upl, w, w, Upl; no ] ^
+ }
+ "&& !rtx_equal_p (operands[4], operands[6])"
+ {
+ operands[6] = copy_rtx (operands[4]);
+ operands[7] = operands[5];
+ }
)
;; Predicated integer wide comparisons in which only the flags result
;; is interesting.
-(define_insn "*aarch64_pred_cmp<cmp_op><mode>_wide_ptest"
+(define_insn_and_rewrite "*aarch64_pred_cmp<cmp_op><mode>_wide_ptest"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand:VNx16BI 1 "register_operand")
(match_operand 4)
(match_operand:SI 5 "aarch64_sve_ptrue_flag")
(unspec:<VPRED>
- [(match_operand:VNx16BI 6 "register_operand")
+ [(match_operand:<VPRED> 6 "register_operand")
(match_operand:SI 7 "aarch64_sve_ptrue_flag")
(unspec:<VPRED>
[(match_operand:SVE_FULL_BHSI 2 "register_operand")
@@ -8539,6 +8782,11 @@
[ ?Upl , 0 , w, w, Upl; yes ] ^
[ Upa , Upl, w, w, Upl; no ] ^
}
+ "&& !rtx_equal_p (operands[4], operands[6])"
+ {
+ operands[6] = copy_rtx (operands[4]);
+ operands[7] = operands[5];
+ }
)
;; -------------------------------------------------------------------------
@@ -8576,6 +8824,58 @@
"while<cmp_op>\t%0.<PRED_ALL:Vetype>, %<w>1, %<w>2"
)
+;; Likewise, but yield a VNx16BI result regardless of the element width.
+;; The .b case is equivalent to the above.
+(define_expand "@aarch64_sve_while_<while_optab_cmp><GPI:mode><VNx16BI_ONLY:mode>_acle"
+ [(parallel
+ [(set (match_operand:VNx16BI_ONLY 0 "register_operand")
+ (unspec:VNx16BI_ONLY
+ [(const_int SVE_WHILE_B)
+ (match_operand:GPI 1 "aarch64_reg_or_zero")
+ (match_operand:GPI 2 "aarch64_reg_or_zero")]
+ SVE_WHILE))
+ (clobber (reg:CC_NZC CC_REGNUM))])]
+ "TARGET_SVE"
+)
+
+;; For wider elements, bitcast the predicate result to a VNx16BI and use
+;; an (and ...) to indicate that only every second, fourth, or eighth bit
+;; is set.
+(define_expand "@aarch64_sve_while_<while_optab_cmp><GPI:mode><PRED_HSD:mode>_acle"
+ [(parallel
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:PRED_HSD
+ [(const_int SVE_WHILE_B)
+ (match_operand:GPI 1 "aarch64_reg_or_zero")
+ (match_operand:GPI 2 "aarch64_reg_or_zero")]
+ SVE_WHILE)
+ 0)
+ (match_dup 3)))
+ (clobber (reg:CC_NZC CC_REGNUM))])]
+ "TARGET_SVE"
+ {
+ operands[3] = aarch64_ptrue_all (<data_bytes>);
+ }
+)
+
+(define_insn "*aarch64_sve_while_<while_optab_cmp><GPI:mode><PRED_HSD:mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:PRED_HSD
+ [(const_int SVE_WHILE_B)
+ (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ")
+ (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")]
+ SVE_WHILE)
+ 0)
+ (match_operand:PRED_HSD 3 "aarch64_ptrue_all_operand")))
+ (clobber (reg:CC_NZC CC_REGNUM))]
+ "TARGET_SVE"
+ "while<cmp_op>\t%0.<PRED_HSD:Vetype>, %<w>1, %<w>2"
+)
+
;; The WHILE instructions set the flags in the same way as a PTEST with
;; a PTRUE GP. Handle the case in which both results are useful. The GP
;; operands to the PTEST aren't needed, so we allow them to be anything.
@@ -8607,6 +8907,38 @@
}
)
+(define_insn_and_rewrite "*while_<while_optab_cmp><GPI:mode><PRED_HSD:mode>_acle_cc"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand 3)
+ (match_operand 4)
+ (const_int SVE_KNOWN_PTRUE)
+ (unspec:PRED_HSD
+ [(const_int SVE_WHILE_B)
+ (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ")
+ (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")]
+ SVE_WHILE)]
+ UNSPEC_PTEST))
+ (set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:PRED_HSD [(const_int SVE_WHILE_B)
+ (match_dup 1)
+ (match_dup 2)]
+ SVE_WHILE)
+ 0)
+ (match_operand:PRED_HSD 5 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE"
+ "while<cmp_op>\t%0.<PRED_HSD:Vetype>, %<w>1, %<w>2"
+ ;; Force the compiler to drop the unused predicate operand, so that we
+ ;; don't have an unnecessary PTRUE.
+ "&& (!CONSTANT_P (operands[3]) || !CONSTANT_P (operands[4]))"
+ {
+ operands[3] = CONSTM1_RTX (VNx16BImode);
+ operands[4] = CONSTM1_RTX (<PRED_HSD:MODE>mode);
+ }
+)
+
;; Same, but handle the case in which only the flags result is useful.
(define_insn_and_rewrite "@while_<while_optab_cmp><GPI:mode><PRED_ALL:mode>_ptest"
[(set (reg:CC_NZC CC_REGNUM)
@@ -8677,6 +9009,43 @@
}
)
+(define_expand "@aarch64_pred_fcm<cmp_op><mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
+ SVE_COND_FP_CMP_I0)
+ 0)
+ (match_dup 5)))]
+ "TARGET_SVE"
+ {
+ operands[5] = aarch64_ptrue_all (GET_MODE_UNIT_SIZE (<MODE>mode));
+ }
+)
+
+(define_insn "*aarch64_pred_fcm<cmp_op><mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
+ SVE_COND_FP_CMP_I0)
+ 0)
+ (match_operand:<VPRED> 5 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE"
+ {@ [ cons: =0 , 1 , 3 , 4 ]
+ [ Upa , Upl , w , Dz ] fcm<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, #0.0
+ [ Upa , Upl , w , w ] fcm<cmp_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>
+ }
+)
+
;; Same for unordered comparisons.
(define_insn "@aarch64_pred_fcmuo<mode>"
[(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
@@ -8690,6 +9059,40 @@
"fcmuo\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>"
)
+(define_expand "@aarch64_pred_fcmuo<mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "register_operand")]
+ UNSPEC_COND_FCMUO)
+ 0)
+ (match_dup 5)))]
+ "TARGET_SVE"
+ {
+ operands[5] = aarch64_ptrue_all (GET_MODE_UNIT_SIZE (<MODE>mode));
+ }
+)
+
+(define_insn "*aarch64_pred_fcmuo<mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand" "Upl")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (match_operand:SVE_F 3 "register_operand" "w")
+ (match_operand:SVE_F 4 "register_operand" "w")]
+ UNSPEC_COND_FCMUO)
+ 0)
+ (match_operand:<VPRED> 5 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE"
+ "fcmuo\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>"
+)
+
;; Floating-point comparisons predicated on a PTRUE, with the results ANDed
;; with another predicate P. This does not have the same trapping behavior
;; as predicating the comparison itself on P, but it's a legitimate fold,
@@ -8908,23 +9311,30 @@
;; -------------------------------------------------------------------------
;; Predicated floating-point absolute comparisons.
-(define_expand "@aarch64_pred_fac<cmp_op><mode>"
- [(set (match_operand:<VPRED> 0 "register_operand")
- (unspec:<VPRED>
- [(match_operand:<VPRED> 1 "register_operand")
- (match_operand:SI 2 "aarch64_sve_ptrue_flag")
- (unspec:SVE_FULL_F
- [(match_dup 1)
- (match_dup 2)
- (match_operand:SVE_FULL_F 3 "register_operand")]
- UNSPEC_COND_FABS)
- (unspec:SVE_FULL_F
- [(match_dup 1)
- (match_dup 2)
- (match_operand:SVE_FULL_F 4 "register_operand")]
- UNSPEC_COND_FABS)]
- SVE_COND_FP_ABS_CMP))]
+(define_expand "@aarch64_pred_fac<cmp_op><mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (unspec:SVE_FULL_F
+ [(match_dup 1)
+ (match_dup 2)
+ (match_operand:SVE_FULL_F 3 "register_operand")]
+ UNSPEC_COND_FABS)
+ (unspec:SVE_FULL_F
+ [(match_dup 1)
+ (match_dup 2)
+ (match_operand:SVE_FULL_F 4 "register_operand")]
+ UNSPEC_COND_FABS)]
+ SVE_COND_FP_ABS_CMP)
+ 0)
+ (match_dup 5)))]
"TARGET_SVE"
+ {
+ operands[5] = aarch64_ptrue_all (GET_MODE_UNIT_SIZE (<MODE>mode));
+ }
)
(define_insn_and_rewrite "*aarch64_pred_fac<cmp_op><mode>_relaxed"
@@ -8973,6 +9383,30 @@
"fac<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>"
)
+(define_insn "*aarch64_pred_fac<cmp_op><mode>_strict_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand" "Upl")
+ (match_operand:SI 4 "aarch64_sve_ptrue_flag")
+ (unspec:SVE_FULL_F
+ [(match_dup 1)
+ (match_operand:SI 5 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_FULL_F 2 "register_operand" "w")]
+ UNSPEC_COND_FABS)
+ (unspec:SVE_FULL_F
+ [(match_dup 1)
+ (match_operand:SI 6 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_FULL_F 3 "register_operand" "w")]
+ UNSPEC_COND_FABS)]
+ SVE_COND_FP_ABS_CMP)
+ 0)
+ (match_operand:<VPRED> 7 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE"
+ "fac<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>"
+)
+
;; -------------------------------------------------------------------------
;; ---- [PRED] Select
;; -------------------------------------------------------------------------
@@ -9421,7 +9855,30 @@
(unspec:PRED_ALL [(match_operand:PRED_ALL 1 "register_operand" "Upa")]
UNSPEC_REV))]
"TARGET_SVE"
- "rev\t%0.<Vetype>, %1.<Vetype>")
+ "rev\t%0.<Vetype>, %1.<Vetype>"
+)
+
+(define_expand "@aarch64_sve_rev<mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (unspec:VNx16BI
+ [(match_operand:VNx16BI 1 "register_operand")
+ (match_dup:PRED_ALL 2)]
+ UNSPEC_REV_PRED))]
+ "TARGET_SVE"
+ {
+ operands[2] = CONST0_RTX (<MODE>mode);
+ }
+)
+
+(define_insn "*aarch64_sve_rev<mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (unspec:VNx16BI
+ [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ (match_operand:PRED_ALL 2 "aarch64_simd_imm_zero")]
+ UNSPEC_REV_PRED))]
+ "TARGET_SVE"
+ "rev\t%0.<Vetype>, %1.<Vetype>"
+)
;; -------------------------------------------------------------------------
;; ---- [PRED] Special-purpose binary permutes
@@ -9446,18 +9903,39 @@
"<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
)
-;; Special purpose permute used by the predicate generation instructions.
-;; Unlike the normal permute patterns, these instructions operate on VNx16BI
-;; regardless of the element size, so that all input and output bits are
-;; well-defined. Operand 3 then indicates the size of the permute.
-(define_insn "@aarch64_sve_trn1_conv<mode>"
+;; Special-purpose permutes used by the ACLE intrinsics and predicate
+;; generation instructions. Unlike the normal permute patterns, these
+;; instructions operate on VNx16BI regardless of the element size, so that
+;; all input and output bits are well-defined. Operand 3 then indicates
+;; the size of the permute.
+;;
+;; To make generation easier, this pattern embeds the permute type as the
+;; fourth operand to the unspec. On the one hand, this avoids overloading
+;; unspecs like UNSPEC_ZIP1 to represent two different operations. On the
+;; other hand, it avoids having a separate unspec for each variant, and
+;; having to map from one kind of unspec to the other.
+(define_expand "@aarch64_sve_<perm_insn><mode>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand")
+ (match_operand:VNx16BI 2 "register_operand")
+ (match_dup:PRED_ALL 3)
+ (const_int PERMUTE)]
+ UNSPEC_PERMUTE_PRED))]
+ "TARGET_SVE"
+ {
+ operands[3] = CONST0_RTX (<MODE>mode);
+ }
+)
+
+(define_insn "*aarch64_sve_<perm_insn><mode>_acle"
[(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
(unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand" "Upa")
(match_operand:VNx16BI 2 "register_operand" "Upa")
- (match_operand:PRED_ALL 3 "aarch64_simd_imm_zero")]
- UNSPEC_TRN1_CONV))]
+ (match_operand:PRED_ALL 3 "aarch64_simd_imm_zero")
+ (const_int PERMUTE)]
+ UNSPEC_PERMUTE_PRED))]
"TARGET_SVE"
- "trn1\t%0.<PRED_ALL:Vetype>, %1.<PRED_ALL:Vetype>, %2.<PRED_ALL:Vetype>"
+ "<perm_insn>\t%0.<PRED_ALL:Vetype>, %1.<PRED_ALL:Vetype>, %2.<PRED_ALL:Vetype>"
)
;; =========================================================================
@@ -10446,6 +10924,34 @@
"punpk<perm_hilo>\t%0.h, %1.b"
)
+(define_expand "@aarch64_sve_punpk<perm_hilo>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:VNx8BI
+ [(match_operand:VNx16BI 1 "register_operand")]
+ UNPACK_UNSIGNED)
+ 0)
+ (match_dup 2)))]
+ "TARGET_SVE"
+ {
+ operands[2] = aarch64_ptrue_all (2);
+ }
+)
+
+(define_insn "*aarch64_sve_punpk<perm_hilo>_acle"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:VNx8BI
+ [(match_operand:VNx16BI 1 "register_operand" "Upa")]
+ UNPACK_UNSIGNED)
+ 0)
+ (match_operand:VNx8BI 2 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE"
+ "punpk<perm_hilo>\t%0.h, %1.b"
+)
+
;; =========================================================================
;; == Vector partitioning
;; =========================================================================
@@ -10670,14 +11176,49 @@
;; -------------------------------------------------------------------------
(define_insn "@aarch64_sve_<sve_pred_op><mode>"
- [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
- (unspec:PRED_ALL
- [(match_operand:PRED_ALL 1 "register_operand" "Upa")
+ [(set (match_operand:VNx16BI_ONLY 0 "register_operand" "=Upa")
+ (unspec:VNx16BI_ONLY
+ [(match_operand:VNx16BI_ONLY 1 "register_operand" "Upa")
(match_operand:SI 2 "aarch64_sve_ptrue_flag")
- (match_operand:PRED_ALL 3 "register_operand" "0")]
+ (match_operand:VNx16BI_ONLY 3 "register_operand" "0")]
SVE_PITER))
(clobber (reg:CC_NZC CC_REGNUM))]
- "TARGET_SVE && <max_elem_bits> >= <elem_bits>"
+ "TARGET_SVE"
+ "<sve_pred_op>\t%0.<Vetype>, %1, %0.<Vetype>"
+)
+
+(define_expand "@aarch64_sve_<sve_pred_op><mode>"
+ [(parallel
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:PRED_HSD
+ [(match_operand:PRED_HSD 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (match_operand:PRED_HSD 3 "register_operand")]
+ PNEXT_ONLY)
+ 0)
+ (match_dup 4)))
+ (clobber (reg:CC_NZC CC_REGNUM))])]
+ "TARGET_SVE"
+ {
+ operands[4] = aarch64_ptrue_all (<data_bytes>);
+ }
+)
+
+(define_insn "*aarch64_sve_<sve_pred_op><mode>"
+ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:PRED_HSD
+ [(match_operand:PRED_HSD 1 "register_operand" "Upa")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (match_operand:PRED_HSD 3 "register_operand" "0")]
+ PNEXT_ONLY)
+ 0)
+ (match_operand:PRED_HSD 4 "aarch64_ptrue_all_operand")))
+ (clobber (reg:CC_NZC CC_REGNUM))]
+ "TARGET_SVE"
"<sve_pred_op>\t%0.<Vetype>, %1, %0.<Vetype>"
)
@@ -10711,6 +11252,38 @@
}
)
+(define_insn_and_rewrite "*aarch64_sve_<sve_pred_op><mode>_cc"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ (match_operand 2)
+ (match_operand:SI 3 "aarch64_sve_ptrue_flag")
+ (unspec:PRED_HSD
+ [(match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (match_operand:PRED_HSD 6 "register_operand" "0")]
+ PNEXT_ONLY)]
+ UNSPEC_PTEST))
+ (set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:PRED_HSD
+ [(match_dup 4)
+ (match_dup 5)
+ (match_dup 6)]
+ PNEXT_ONLY)
+ 0)
+ (match_operand:PRED_HSD 7 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[2], &operands[4])"
+ "<sve_pred_op>\t%0.<Vetype>, %1, %0.<Vetype>"
+ "&& !rtx_equal_p (operands[2], operands[4])"
+ {
+ operands[4] = operands[2];
+ operands[5] = operands[3];
+ }
+)
+
;; Same, but with only the flags result being interesting.
(define_insn_and_rewrite "*aarch64_sve_<sve_pred_op><mode>_ptest"
[(set (reg:CC_NZC CC_REGNUM)
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 31bdd85..a3cbbce 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -2211,14 +2211,14 @@
;; - FDOT (2-way, indexed) (FP8DOT2)
;; -------------------------------------------------------------------------
(define_insn "@aarch64_sve_dot<mode>"
- [(set (match_operand:SVE_FULL_HSF 0 "register_operand")
- (unspec:SVE_FULL_HSF
- [(match_operand:SVE_FULL_HSF 1 "register_operand")
+ [(set (match_operand:SVE_FULL_HSF_FP8_FDOT 0 "register_operand")
+ (unspec:SVE_FULL_HSF_FP8_FDOT
+ [(match_operand:SVE_FULL_HSF_FP8_FDOT 1 "register_operand")
(match_operand:VNx16QI 2 "register_operand")
(match_operand:VNx16QI 3 "register_operand")
(reg:DI FPM_REGNUM)]
UNSPEC_DOT_FP8))]
- "TARGET_SSVE_FP8DOT4 && !(<MODE>mode == VNx8HFmode && !TARGET_SSVE_FP8DOT2)"
+ ""
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
[ w , 0 , w , w ; * ] fdot\t%0.<Vetype>, %2.b, %3.b
[ ?&w , w , w , w ; yes ] movprfx\t%0, %1\;fdot\t%0.<Vetype>, %2.b, %3.b
@@ -2226,15 +2226,15 @@
)
(define_insn "@aarch64_sve_dot_lane<mode>"
- [(set (match_operand:SVE_FULL_HSF 0 "register_operand")
- (unspec:SVE_FULL_HSF
- [(match_operand:SVE_FULL_HSF 1 "register_operand")
+ [(set (match_operand:SVE_FULL_HSF_FP8_FDOT 0 "register_operand")
+ (unspec:SVE_FULL_HSF_FP8_FDOT
+ [(match_operand:SVE_FULL_HSF_FP8_FDOT 1 "register_operand")
(match_operand:VNx16QI 2 "register_operand")
(match_operand:VNx16QI 3 "register_operand")
(match_operand:SI 4 "const_int_operand")
(reg:DI FPM_REGNUM)]
UNSPEC_DOT_LANE_FP8))]
- "TARGET_SSVE_FP8DOT4 && !(<MODE>mode == VNx8HFmode && !TARGET_SSVE_FP8DOT2)"
+ ""
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
[ w , 0 , w , y ; * ] fdot\t%0.<Vetype>, %2.b, %3.b[%4]
[ ?&w , w , w , y ; yes ] movprfx\t%0, %1\;fdot\t%0.<Vetype>, %2.b, %3.b[%4]
@@ -4068,8 +4068,8 @@
[(match_operand:<VPRED> 1 "register_operand")
(match_operand:SI 2 "aarch64_sve_ptrue_flag")
(unspec:<VPRED>
- [(match_operand:SVE_FULL_BHI 3 "register_operand")
- (match_operand:SVE_FULL_BHI 4 "register_operand")]
+ [(match_operand:VNx16QI_ONLY 3 "register_operand")
+ (match_operand:VNx16QI_ONLY 4 "register_operand")]
SVE2_MATCH)]
UNSPEC_PRED_Z))
(clobber (reg:CC_NZC CC_REGNUM))]
@@ -4081,6 +4081,51 @@
}
)
+(define_expand "@aarch64_pred_<sve_int_op><mode>"
+ [(parallel
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand:VNx8HI_ONLY 3 "register_operand")
+ (match_operand:VNx8HI_ONLY 4 "register_operand")]
+ SVE2_MATCH)]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_dup 5)))
+ (clobber (reg:CC_NZC CC_REGNUM))])]
+ "TARGET_SVE2 && TARGET_NON_STREAMING"
+ {
+ operands[5] = aarch64_ptrue_all (GET_MODE_UNIT_SIZE (<MODE>mode));
+ }
+)
+
+(define_insn "*aarch64_pred_<sve_int_op><mode>"
+ [(set (match_operand:VNx16BI 0 "register_operand")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 2 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand:VNx8HI_ONLY 3 "register_operand")
+ (match_operand:VNx8HI_ONLY 4 "register_operand")]
+ SVE2_MATCH)]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_operand:<VPRED> 5 "aarch64_ptrue_all_operand")))
+ (clobber (reg:CC_NZC CC_REGNUM))]
+ "TARGET_SVE2 && TARGET_NON_STREAMING"
+ {@ [ cons: =0, 1 , 3, 4; attrs: pred_clobber ]
+ [ &Upa , Upl, w, w; yes ] <sve_int_op>\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>
+ [ ?Upl , 0 , w, w; yes ] ^
+ [ Upa , Upl, w, w; no ] ^
+ }
+)
+
;; Predicated string matching in which both the flag and predicate results
;; are interesting.
(define_insn_and_rewrite "*aarch64_pred_<sve_int_op><mode>_cc"
@@ -4118,6 +4163,45 @@
}
)
+(define_insn_and_rewrite "*aarch64_pred_<sve_int_op><mode>_cc"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upl")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand 6)
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (unspec:<VPRED>
+ [(match_operand:VNx8HI_ONLY 2 "register_operand" "w")
+ (match_operand:VNx8HI_ONLY 3 "register_operand" "w")]
+ SVE2_MATCH)]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+ (and:VNx16BI
+ (subreg:VNx16BI
+ (unspec:<VPRED>
+ [(match_dup 6)
+ (match_dup 7)
+ (unspec:<VPRED>
+ [(match_dup 2)
+ (match_dup 3)]
+ SVE2_MATCH)]
+ UNSPEC_PRED_Z)
+ 0)
+ (match_operand:<VPRED> 8 "aarch64_ptrue_all_operand")))]
+ "TARGET_SVE2
+ && TARGET_NON_STREAMING
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+ "<sve_int_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>"
+ "&& !rtx_equal_p (operands[4], operands[6])"
+ {
+ operands[6] = copy_rtx (operands[4]);
+ operands[7] = operands[5];
+ }
+)
+
;; Predicated string matching in which only the flags result is interesting.
(define_insn_and_rewrite "*aarch64_pred_<sve_int_op><mode>_ptest"
[(set (reg:CC_NZC CC_REGNUM)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4d9d83d..ef9c165 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -430,6 +430,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
#include "tuning_models/neoversev2.h"
#include "tuning_models/neoversev3.h"
#include "tuning_models/neoversev3ae.h"
+#include "tuning_models/olympus.h"
#include "tuning_models/a64fx.h"
#include "tuning_models/fujitsu_monaka.h"
@@ -974,19 +975,24 @@ aarch64_cb_rhs (rtx_code op_code, rtx rhs)
{
case EQ:
case NE:
- case GT:
- case GTU:
case LT:
case LTU:
+ case GE:
+ case GEU:
+ /* EQ/NE range is 0 .. 63.
+ LT/LTU range is 0 .. 63.
+ GE/GEU range is 1 .. 64 => GT x - 1, but also supports 0 via XZR.
+ So the intersection is 0 .. 63. */
return IN_RANGE (rhs_val, 0, 63);
- case GE: /* CBGE: signed greater than or equal */
- case GEU: /* CBHS: unsigned greater than or equal */
- return IN_RANGE (rhs_val, 1, 64);
-
- case LE: /* CBLE: signed less than or equal */
- case LEU: /* CBLS: unsigned less than or equal */
- return IN_RANGE (rhs_val, -1, 62);
+ case GT:
+ case GTU:
+ case LE:
+ case LEU:
+      /* GT/GTU range is 0 .. 63.
+ LE/LEU range is -1 .. 62 => LT x + 1.
+ So the intersection is 0 .. 62. */
+ return IN_RANGE (rhs_val, 0, 62);
default:
return false;
@@ -2881,10 +2887,47 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
return aarch64_gen_compare_reg (code, x, y);
}
+/* Split IMM into two 12-bit halves, producing an EQ/NE comparison vs X.
+ TMP may be a scratch. This optimizes a sequence from
+ mov x0, #imm1
+ movk x0, #imm2, lsl 16 -- x0 contains CST
+ cmp x1, x0
+ into the shorter:
+ sub tmp, x1, #(CST & 0xfff000)
+ subs tmp, tmp, #(CST & 0x000fff)
+*/
+rtx
+aarch64_gen_compare_split_imm24 (rtx x, rtx imm, rtx tmp)
+{
+ HOST_WIDE_INT lo_imm = UINTVAL (imm) & 0xfff;
+ HOST_WIDE_INT hi_imm = UINTVAL (imm) & 0xfff000;
+ enum machine_mode mode = GET_MODE (x);
+
+ if (GET_CODE (tmp) == SCRATCH)
+ tmp = gen_reg_rtx (mode);
+
+ emit_insn (gen_add3_insn (tmp, x, GEN_INT (-hi_imm)));
+ /* TODO: We don't need the gpr result of the second insn. */
+ switch (mode)
+ {
+    case E_SImode:
+ tmp = gen_addsi3_compare0 (tmp, tmp, GEN_INT (-lo_imm));
+ break;
+    case E_DImode:
+ tmp = gen_adddi3_compare0 (tmp, tmp, GEN_INT (-lo_imm));
+ break;
+ default:
+ abort ();
+ }
+ emit_insn (tmp);
+
+ return gen_rtx_REG (CC_NZmode, CC_REGNUM);
+}
+
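The split relies only on modular arithmetic: subtracting the high and low 12-bit halves in sequence leaves the same value, and therefore the same Z flag, as subtracting the whole constant.  A self-contained check of that identity (illustrative only, with an arbitrarily chosen constant):

    // Illustrative check: for CST split into hi/lo halves on bit 12,
    // (x - hi) - lo wraps to the same value as x - CST, so the final
    // flag-setting subtraction still tests x == CST.
    #include <cstdint>
    #include <cassert>

    int
    main ()
    {
      const uint64_t cst = 0xabcdef;          // an arbitrary 24-bit constant
      const uint64_t hi = cst & 0xfff000;
      const uint64_t lo = cst & 0x000fff;
      const uint64_t tests[] = { 0, 1, cst, cst - 1, ~uint64_t (0) };
      for (uint64_t x : tests)
        {
          assert ((x - hi) - lo == x - cst);
          assert (((x - hi) - lo == 0) == (x == cst));
        }
      return 0;
    }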
/* Generate conditional branch to LABEL, comparing X to 0 using CODE.
Return the jump instruction. */
-static rtx
+rtx
aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
rtx_code_label *label)
{
@@ -3932,18 +3975,53 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
}
+/* PRED is a predicate that governs an operation on DATA_MODE. If DATA_MODE
+ is a partial vector mode, and if exceptions must be suppressed for its
+ undefined elements, convert PRED from a container-level predicate to
+ an element-level predicate and ensure that the undefined elements
+ are inactive. Make no changes otherwise.
+
+ Return the resultant predicate. */
+rtx
+aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred)
+{
+ unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
+ if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
+ {
+ /* Generate an element-level mask. */
+ rtx mask = aarch64_sve_packed_pred (data_mode);
+ machine_mode pmode = GET_MODE (mask);
+
+ /* Apply the existing predicate. */
+ rtx dst = gen_reg_rtx (pmode);
+ emit_insn (gen_and3 (pmode, dst, mask,
+ gen_lowpart (pmode, pred)));
+ return dst;
+ }
+
+ return pred;
+}
+
/* Emit a comparison CMP between OP0 and OP1, both of which have mode
DATA_MODE, and return the result in a predicate of mode PRED_MODE.
- Use TARGET as the target register if nonnull and convenient. */
+ Use TARGET as the target register if nonnull and convenient.
+
+ PRED_MODE can be either VNx16BI or the natural predicate mode for
+ DATA_MODE. */
static rtx
aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
machine_mode data_mode, rtx op1, rtx op2)
{
- insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
+ auto src_pred_mode = aarch64_sve_pred_mode (data_mode);
+ insn_code icode;
+ if (known_eq (GET_MODE_NUNITS (pred_mode), GET_MODE_NUNITS (data_mode)))
+ icode = code_for_aarch64_pred_cmp (cmp, data_mode);
+ else
+ icode = code_for_aarch64_pred_cmp_acle (cmp, data_mode);
expand_operand ops[5];
create_output_operand (&ops[0], target, pred_mode);
- create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
+ create_input_operand (&ops[1], CONSTM1_RTX (src_pred_mode), src_pred_mode);
create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
create_input_operand (&ops[3], op1, data_mode);
create_input_operand (&ops[4], op2, data_mode);
@@ -3951,15 +4029,14 @@ aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
return ops[0].value;
}
-/* Use a comparison to convert integer vector SRC into MODE, which is
- the corresponding SVE predicate mode. Use TARGET for the result
- if it's nonnull and convenient. */
+/* Use a comparison to convert integer vector SRC into VNx16BI.
+ Use TARGET for the result if it's nonnull and convenient. */
rtx
-aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
+aarch64_convert_sve_data_to_pred (rtx target, rtx src)
{
machine_mode src_mode = GET_MODE (src);
- return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
+ return aarch64_sve_emit_int_cmp (target, VNx16BImode, NE, src_mode,
src, CONST0_RTX (src_mode));
}
@@ -6041,9 +6118,9 @@ aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
unsigned int vl)
{
rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
- target = aarch64_target_reg (target, mode);
- emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
- target, const0_rtx, limit));
+ target = aarch64_target_reg (target, VNx16BImode);
+ emit_insn (gen_aarch64_sve_while_acle (UNSPEC_WHILELO, DImode, mode,
+ target, const0_rtx, limit));
return target;
}
@@ -6189,8 +6266,7 @@ aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
operands but permutes them as though they had mode MODE. */
machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
target = aarch64_target_reg (target, GET_MODE (a));
- rtx type_reg = CONST0_RTX (mode);
- emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
+ emit_insn (gen_aarch64_sve_acle (UNSPEC_TRN1, mode, target, a, b));
return target;
}
@@ -6272,8 +6348,7 @@ aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
int_builder.quick_push (INTVAL (builder.elt (i))
? constm1_rtx : const0_rtx);
- return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
- int_builder.build ());
+ return aarch64_convert_sve_data_to_pred (target, int_builder.build ());
}
/* Set DEST to immediate IMM. */
@@ -6725,6 +6800,27 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
dest, ptrue, src));
}
+/* Set predicate register DEST such that every element has the scalar
+ boolean value in SRC, with any nonzero source counting as "true".
+ MODE is a MODE_VECTOR_BOOL that determines the element size;
+ DEST can have this mode or VNx16BImode. In the latter case,
+ the upper bits of each element are defined to be zero, as for
+ the .H, .S, and .D forms of PTRUE. */
+
+void
+aarch64_emit_sve_pred_vec_duplicate (machine_mode mode, rtx dest, rtx src)
+{
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_ashldi3 (tmp, gen_lowpart (DImode, src),
+ gen_int_mode (63, DImode)));
+ if (GET_MODE (dest) == VNx16BImode)
+ emit_insn (gen_aarch64_sve_while_acle (UNSPEC_WHILELO, DImode, mode,
+ dest, const0_rtx, tmp));
+ else
+ emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
+ dest, const0_rtx, tmp));
+}
+
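The shift-then-WHILELO trick works because WHILELO activates lane I exactly when BASE + I is below LIMIT as an unsigned number; moving a 0/1 flag into bit 63 therefore produces a limit of either 0 (no lanes) or 1 << 63 (all lanes).  A small model of that behaviour, assuming SRC holds 0 or 1 (illustrative only):

    // Model of the WHILELO-based broadcast: lane I is active while
    // BASE + I < LIMIT (unsigned), so LIMIT = SRC << 63 gives either no
    // lanes (SRC == 0) or all lanes (SRC == 1).
    #include <cstdint>
    #include <cassert>

    static bool
    whilelo_lane_active (uint64_t base, uint64_t limit, unsigned int lane)
    {
      return base + lane < limit;
    }

    int
    main ()
    {
      for (uint64_t src : { uint64_t (0), uint64_t (1) })
        {
          uint64_t limit = src << 63;
          for (unsigned int lane = 0; lane < 16; ++lane)
            assert (whilelo_lane_active (0, limit, lane) == (src != 0));
        }
      return 0;
    }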
static bool
aarch64_function_ok_for_sibcall (tree, tree exp)
{
@@ -14326,42 +14422,58 @@ aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
{
/* Conditional branch. */
- if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
+ enum machine_mode cmpmode = GET_MODE (inner);
+ if (GET_MODE_CLASS (cmpmode) == MODE_CC)
return true;
- else
+
+ if (comparator == const0_rtx)
{
- if (cmpcode == NE || cmpcode == EQ)
+ switch (cmpcode)
{
- if (comparator == const0_rtx)
+ case NE:
+ case EQ:
+ if (cmpmode != SImode && cmpmode != DImode)
+ break;
+ if (GET_CODE (inner) == ZERO_EXTRACT)
{
- /* TBZ/TBNZ/CBZ/CBNZ. */
- if (GET_CODE (inner) == ZERO_EXTRACT)
- /* TBZ/TBNZ. */
- *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
- ZERO_EXTRACT, 0, speed);
- else
- /* CBZ/CBNZ. */
- *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
-
+ /* TBZ/TBNZ. */
+ *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
+ ZERO_EXTRACT, 0, speed);
return true;
}
- if (register_operand (inner, VOIDmode)
- && aarch64_imm24 (comparator, VOIDmode))
- {
- /* SUB and SUBS. */
- *cost += COSTS_N_INSNS (2);
- if (speed)
- *cost += extra_cost->alu.arith * 2;
- return true;
- }
- }
- else if (cmpcode == LT || cmpcode == GE)
- {
- /* TBZ/TBNZ. */
- if (comparator == const0_rtx)
- return true;
+ /* FALLTHRU */
+
+ case LT:
+ case GE:
+ /* CBZ/CBNZ/TBZ/TBNZ. */
+ *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed);
+ return true;
+
+ default:
+ break;
}
}
+
+ if ((cmpcode == NE || cmpcode == EQ)
+ && (cmpmode == SImode || cmpmode == DImode)
+ && aarch64_split_imm24 (comparator, cmpmode))
+ {
+ /* SUB and SUBS. */
+ *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed);
+ *cost += COSTS_N_INSNS (2);
+ if (speed)
+ *cost += extra_cost->alu.arith * 2;
+ return true;
+ }
+
+ if (TARGET_CMPBR)
+ {
+ *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed);
+ if ((cmpmode != SImode && cmpmode != DImode)
+ || !aarch64_cb_rhs (cmpcode, comparator))
+ *cost += rtx_cost (comparator, cmpmode, cmpcode, 1, speed);
+ return true;
+ }
}
else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
{
@@ -16945,6 +17057,14 @@ private:
or vector loop. There is one entry for each tuning option of
interest. */
auto_vec<aarch64_vec_op_count, 2> m_ops;
+
+ /* When doing inner-loop vectorization the constraints on the data-refs in the
+  /* When doing inner-loop vectorization, the constraints on the data refs in
+     the outer loop can limit the inner-loop references; e.g. the outer loop
+     can force the inner loop to do a load and splat, which leaves the loop
+     entirely scalar because all lanes work on a duplicate.  We currently don't
+     support unrolling the inner loop independently of the outer loop during
+     outer-loop vectorization, which tends to lead to pipeline bubbles.  */
};
aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
@@ -17165,8 +17285,8 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
&& STMT_VINFO_DATA_REF (stmt_info))
{
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
- if (stmt_info
- && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES)
+ if (node
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES)
return DR_GROUP_SIZE (stmt_info);
}
return 0;
@@ -17266,13 +17386,14 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
static bool
aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
- unsigned int vec_flags)
+ slp_tree node, unsigned int vec_flags)
{
gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
if (!assign
+ || !node
|| gimple_assign_rhs_code (assign) != BIT_AND_EXPR
- || !STMT_VINFO_VECTYPE (stmt_info)
- || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
+ || !SLP_TREE_VECTYPE (node)
+ || !VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
return false;
for (int i = 1; i < 3; ++i)
@@ -17307,10 +17428,11 @@ aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
instructions. */
static unsigned int
aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
+ slp_tree node,
stmt_vec_info stmt_info,
const sve_vec_cost *sve_costs)
{
- switch (vect_reduc_type (vinfo, stmt_info))
+ switch (vect_reduc_type (vinfo, node))
{
case EXTRACT_LAST_REDUCTION:
return sve_costs->clast_cost;
@@ -17350,7 +17472,9 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
- If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
SVE implementation. */
static unsigned int
-aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
+aarch64_in_loop_reduction_latency (vec_info *vinfo,
+ slp_tree node,
+ stmt_vec_info stmt_info,
unsigned int vec_flags)
{
const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
@@ -17363,7 +17487,8 @@ aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
if (sve_costs)
{
unsigned int latency
- = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
+ = aarch64_sve_in_loop_reduction_latency (vinfo, node,
+ stmt_info, sve_costs);
if (latency)
return latency;
}
@@ -17437,8 +17562,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
for each element. We therefore need to divide the full-instruction
cost by the number of elements in the vector. */
if (kind == scalar_load
+ && node
&& sve_costs
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))
{
unsigned int nunits = vect_nunits_for_cost (vectype);
/* Test for VNx2 modes, which have 64-bit containers. */
@@ -17450,8 +17576,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
/* Detect cases in which a scalar_store is really storing one element
in a scatter operation. */
if (kind == scalar_store
+ && node
&& sve_costs
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))
return sve_costs->scatter_store_elt_cost;
/* Detect cases in which vec_to_scalar represents an in-loop reduction. */
@@ -17460,7 +17587,8 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
&& sve_costs)
{
unsigned int latency
- = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
+ = aarch64_sve_in_loop_reduction_latency (vinfo, node,
+ stmt_info, sve_costs);
if (latency)
return latency;
}
@@ -17609,7 +17737,7 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
/* For vector boolean ANDs with a compare operand we just need
one insn. */
- if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
+ if (aarch64_bool_compound_p (vinfo, stmt_info, node, vec_flags))
return 0;
}
@@ -17642,13 +17770,12 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
with the single accumulator being read and written multiple times. */
static bool
-aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
+aarch64_force_single_cycle (vec_info *vinfo, slp_tree node)
{
- if (!STMT_VINFO_REDUC_DEF (stmt_info))
+ auto reduc_info = info_for_reduction (as_a <loop_vec_info> (vinfo), node);
+ if (!reduc_info)
return false;
-
- auto reduc_info = info_for_reduction (vinfo, stmt_info);
- return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
+ return VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
}
/* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
@@ -17672,8 +17799,10 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
&& vect_is_reduction (stmt_info))
{
unsigned int base
- = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
- if (aarch64_force_single_cycle (m_vinfo, stmt_info))
+ = aarch64_in_loop_reduction_latency (m_vinfo, node,
+ stmt_info, m_vec_flags);
+ if (m_costing_for_scalar
+ || aarch64_force_single_cycle (m_vinfo, node))
/* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
and then accumulate that, but at the moment the loop-carried
dependency includes all copies. */
@@ -17690,7 +17819,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
/* Assume that bool AND with compare operands will become a single
operation. */
- if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
+ if (aarch64_bool_compound_p (m_vinfo, stmt_info, node, m_vec_flags))
return;
}
@@ -17707,7 +17836,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
if (stmt_info
&& kind == vec_to_scalar
&& (m_vec_flags & VEC_ADVSIMD)
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))
{
auto dr = STMT_VINFO_DATA_REF (stmt_info);
tree dr_ref = DR_REF (dr);
@@ -17720,7 +17849,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
{
if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
{
- if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
+ if (SLP_TREE_TYPE (node) == load_vec_info_type)
ops->loads += count - 1;
else
/* Stores want to count both the index to array and data to
@@ -17786,7 +17915,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
have only accounted for one. */
if (stmt_info
&& (kind == vector_stmt || kind == vec_to_scalar)
- && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
+ && vect_reduc_type (m_vinfo, node) == COND_REDUCTION)
ops->general_ops += count;
/* Count the predicate operations needed by an SVE comparison. */
@@ -17822,7 +17951,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
if (stmt_info
&& sve_issue
&& (kind == scalar_load || kind == scalar_store)
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))
{
unsigned int pairs = CEIL (count, 2);
ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
@@ -17931,6 +18060,17 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
tree vectype, int misalign,
vect_cost_model_location where)
{
+ /* When costing for scalars, vectype will be NULL; so look up the type via
+  /* When costing for scalars, vectype will be NULL, so look up the type via
+ if (m_costing_for_scalar && stmt_info)
+ {
+ gcc_assert (!vectype);
+ /* This won't work for e.g. gconds or other statements without a lhs,
+     but those only work on GPRs anyway and this is the best we can do.  */
+ if (tree lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info)))
+ vectype = TREE_TYPE (lhs);
+ }
+
fractional_cost stmt_cost
= aarch64_builtin_vectorization_cost (kind, vectype, misalign);
@@ -17946,6 +18086,28 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
analyze_loop_vinfo (loop_vinfo);
m_analyzed_vinfo = true;
+ if (in_inner_loop_p)
+ m_loop_fully_scalar_dup = true;
+ }
+
+ /* Detect whether the loop is working on fully duplicated lanes. This would
+ only be possible with inner loop vectorization since otherwise we wouldn't
+ try to vectorize. */
+ if (in_inner_loop_p
+ && node
+ && m_loop_fully_scalar_dup
+ && SLP_TREE_LANES (node) == 1
+ && !SLP_TREE_CHILDREN (node).exists ())
+ {
+ /* Check if load is a duplicate. */
+ if (gimple_vuse (stmt_info->stmt)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_INVARIANT)
+ ;
+ else if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
+ || SLP_TREE_DEF_TYPE (node) == vect_external_def)
+ ;
+ else
+ m_loop_fully_scalar_dup = false;
}
/* Apply the heuristic described above m_stp_sequence_cost. */
@@ -17977,9 +18139,10 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
/* Check if we've seen an SVE gather/scatter operation and which size. */
if (kind == scalar_load
+ && node
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype))
- && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+ && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))
{
const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
if (sve_costs)
@@ -18311,8 +18474,19 @@ adjust_body_cost (loop_vec_info loop_vinfo,
if (m_vec_flags & VEC_ANY_SVE)
threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
- if (m_num_vector_iterations >= 1
- && m_num_vector_iterations < threshold)
+  /* Increase the cost of the vector code if it looks like it has limited
+     throughput due to outer-loop vectorization.  */
+ if (m_loop_fully_scalar_dup)
+ {
+ body_cost *= estimated_vf;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Increasing body cost to %d because vector code has"
+			 " low throughput per iteration due to splats\n",
+ body_cost);
+ }
+ else if (m_num_vector_iterations >= 1
+ && m_num_vector_iterations < threshold)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@@ -20481,6 +20655,8 @@ aarch64_compare_version_priority (tree decl1, tree decl2)
unsigned long _size; // Size of the struct, so it can grow.
unsigned long _hwcap;
unsigned long _hwcap2;
+ unsigned long _hwcap3;
+ unsigned long _hwcap4;
}
*/
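With the two new fields, a resolver can detect their presence through the _size field.  The sketch below mirrors the struct in the comment above; the convention of receiving it as the resolver's second argument follows the usual AArch64 ifunc ABI and is an assumption, not something defined by this patch:

    // Illustrative resolver; struct and field names mirror the comment above.
    #include <cstddef>

    struct ifunc_arg_t
    {
      unsigned long _size;
      unsigned long _hwcap;
      unsigned long _hwcap2;
      unsigned long _hwcap3;
      unsigned long _hwcap4;
    };

    extern "C" void *
    resolve_foo (unsigned long hwcap, const ifunc_arg_t *arg)
    {
      unsigned long hwcap3 = 0;
      if (arg
          && arg->_size >= offsetof (ifunc_arg_t, _hwcap3) + sizeof (arg->_hwcap3))
        hwcap3 = arg->_hwcap3;
      (void) hwcap;
      (void) hwcap3;
      // ... select an implementation based on the hwcap words ...
      return nullptr;
    }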
@@ -20497,14 +20673,24 @@ build_ifunc_arg_type ()
tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
get_identifier ("_hwcap2"),
long_unsigned_type_node);
+ tree field4 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+ get_identifier ("_hwcap3"),
+ long_unsigned_type_node);
+ tree field5 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+ get_identifier ("_hwcap4"),
+ long_unsigned_type_node);
DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
+ DECL_FIELD_CONTEXT (field4) = ifunc_arg_type;
+ DECL_FIELD_CONTEXT (field5) = ifunc_arg_type;
TYPE_FIELDS (ifunc_arg_type) = field1;
DECL_CHAIN (field1) = field2;
DECL_CHAIN (field2) = field3;
+ DECL_CHAIN (field3) = field4;
+ DECL_CHAIN (field4) = field5;
layout_type (ifunc_arg_type);
@@ -25366,20 +25552,41 @@ aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
}
+/* Return true if function declaration FNDECL needs to be marked as
+ having a variant PCS. */
+
+static bool
+aarch64_is_variant_pcs (tree fndecl)
+{
+ /* Check for ABIs that preserve more registers than usual. */
+ arm_pcs pcs = (arm_pcs) fndecl_abi (fndecl).id ();
+ if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
+ return true;
+
+ /* Check for ABIs that allow PSTATE.SM to be 1 on entry. */
+ tree fntype = TREE_TYPE (fndecl);
+ if (aarch64_fntype_pstate_sm (fntype) != AARCH64_ISA_MODE_SM_OFF)
+ return true;
+
+ /* Check for ABIs that require PSTATE.ZA to be 1 on entry, either because
+ of ZA or ZT0. */
+ if (aarch64_fntype_pstate_za (fntype) != 0)
+ return true;
+
+ return false;
+}
+
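For reference, a few illustrative declarations that the new predicate would classify as variant-PCS under the three rules above (assuming SVE/SME support and the usual ACLE keywords; these examples are not taken from the patch):

    // Illustrative declarations only; each should be emitted with a
    // .variant_pcs directive.
    #include <arm_sve.h>

    __attribute__ ((aarch64_vector_pcs)) void simd_fn (void);  // ARM_PCS_SIMD
    svint32_t sve_fn (svint32_t x);                            // ARM_PCS_SVE
    void streaming_fn (void) __arm_streaming;                  // PSTATE.SM may be 1
    void za_fn (void) __arm_inout ("za");                      // PSTATE.ZA is 1 on entry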
/* Output .variant_pcs for aarch64_vector_pcs function symbols. */
static void
aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
{
- if (TREE_CODE (decl) == FUNCTION_DECL)
+ if (TREE_CODE (decl) == FUNCTION_DECL
+ && aarch64_is_variant_pcs (decl))
{
- arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
- if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
- {
- fprintf (stream, "\t.variant_pcs\t");
- assemble_name (stream, name);
- fprintf (stream, "\n");
- }
+ fprintf (stream, "\t.variant_pcs\t");
+ assemble_name (stream, name);
+ fprintf (stream, "\n");
}
}
@@ -31718,7 +31925,7 @@ aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode,
/* Expand the spaceship optab for floating-point operands.
- If the result is compared against (-1, 0, 1 , 2), expand into
+ If the result is compared against (-1, 0, 1, -128), expand into
fcmpe + conditional branch insns.
Otherwise (the result is just stored as an integer), expand into
@@ -31757,7 +31964,7 @@ aarch64_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx hint)
emit_jump (end_label);
emit_label (un_label);
- emit_move_insn (dest, const2_rtx);
+ emit_move_insn (dest, GEN_INT (-128));
emit_jump (end_label);
emit_label (gt_label);
@@ -31963,9 +32170,43 @@ aarch64_test_sysreg_encoding_clashes (void)
static void
aarch64_test_sve_folding ()
{
+ aarch64_target_switcher switcher (AARCH64_FL_SVE);
+
tree res = fold_unary (BIT_NOT_EXPR, ssizetype,
ssize_int (poly_int64 (1, 1)));
ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1))));
+
+ auto build_v16bi = [](bool a, bool b)
+ {
+ rtx_vector_builder builder (VNx16BImode, 2, 1);
+ builder.quick_push (a ? const1_rtx : const0_rtx);
+ builder.quick_push (b ? const1_rtx : const0_rtx);
+ return builder.build ();
+ };
+ rtx v16bi_10 = build_v16bi (1, 0);
+ rtx v16bi_01 = build_v16bi (0, 1);
+
+ for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode })
+ {
+ rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1);
+ rtx subreg = lowpart_subreg (VNx16BImode, reg, mode);
+ rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10);
+ ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg);
+ rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode));
+
+ rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10);
+ ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode));
+ rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg);
+
+ rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10);
+ ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode),
+ lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg),
+ VNx16BImode));
+ rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01);
+ ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg);
+ }
}
/* Run all target-specific selftests. */
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 096c853..2b3610c 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -410,8 +410,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
/* CSSC instructions are enabled through +cssc. */
#define TARGET_CSSC AARCH64_HAVE_ISA (CSSC)
-/* CB<cc> instructions are enabled through +cmpbr. */
-#define TARGET_CMPBR AARCH64_HAVE_ISA (CMPBR)
+/* CB<cc> instructions are enabled through +cmpbr,
+ but are incompatible with -mtrack-speculation. */
+#define TARGET_CMPBR (AARCH64_HAVE_ISA (CMPBR) && !aarch64_track_speculation)
/* Make sure this is always defined so we don't have to check for ifdefs
but rather use normal ifs. */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a4ae685..6e215c4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -280,6 +280,7 @@
UNSPEC_PACIBSP
UNSPEC_PRLG_STK
UNSPEC_REV
+ UNSPEC_REV_PRED
UNSPEC_SADALP
UNSPEC_SCVTF
UNSPEC_SET_LANE
@@ -440,6 +441,16 @@
; must not operate on inactive inputs if doing so could induce a fault.
(SVE_STRICT_GP 1)])
+;; These constants are used as const_ints in MTE instructions.
+(define_constants
+ [; 0xf0ff...
+ ; Tag mask for the 4-bit tag stored in the top 8 bits of a pointer.
+ (MEMTAG_TAG_MASK -1080863910568919041)
+
+ ; 0x00ff...
+   ; Mask for the 56-bit address used by the subp instruction.
+ (MEMTAG_ADDR_MASK 72057594037927935)])
+
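A quick arithmetic check (illustrative only) that the decimal const_int values above encode the intended masks:

    // Illustrative check that the decimal const_int values match the masks
    // described in the comments.
    #include <cstdint>

    static_assert (uint64_t (-1080863910568919041LL) == 0xf0ffffffffffffffULL,
                   "MEMTAG_TAG_MASK");
    static_assert (uint64_t (72057594037927935LL) == 0x00ffffffffffffffULL,
                   "MEMTAG_ADDR_MASK");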
(include "constraints.md")
(include "predicates.md")
(include "iterators.md")
@@ -724,8 +735,8 @@
(BRANCH_LEN_N_32KiB -32768)
;; +/- 1KiB. Used by CBB<cond>, CBH<cond>, CB<cond>.
- (BRANCH_LEN_P_1Kib 1020)
- (BRANCH_LEN_N_1Kib -1024)
+ (BRANCH_LEN_P_1KiB 1020)
+ (BRANCH_LEN_N_1KiB -1024)
]
)
@@ -803,7 +814,7 @@
)
;; For an EQ/NE comparison against zero, emit `CBZ`/`CBNZ`
-(define_insn "aarch64_cbz<optab><mode>1"
+(define_insn "*aarch64_cbz<optab><mode>"
[(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
(const_int 0))
(label_ref (match_operand 1))
@@ -837,27 +848,13 @@
[(set (pc) (if_then_else (LTGE (match_operand:ALLI 0 "register_operand" "r")
(const_int 0))
(label_ref (match_operand 1))
- (pc)))
- (clobber (reg:CC CC_REGNUM))]
+ (pc)))]
"!aarch64_track_speculation"
{
- if (get_attr_length (insn) == 8)
- {
- if (get_attr_far_branch (insn) == FAR_BRANCH_YES)
- return aarch64_gen_far_branch (operands, 1, "Ltb",
- "<inv_tb>\\t%<w>0, <sizem1>, ");
- else
- {
- char buf[64];
- uint64_t val = ((uint64_t) 1)
- << (GET_MODE_SIZE (<MODE>mode) * BITS_PER_UNIT - 1);
- sprintf (buf, "tst\t%%<w>0, %" PRId64, val);
- output_asm_insn (buf, operands);
- return "<bcond>\t%l1";
- }
- }
- else
+ if (get_attr_length (insn) == 4)
return "<tbz>\t%<w>0, <sizem1>, %l1";
+ return aarch64_gen_far_branch (operands, 1, "Ltb",
+ "<inv_tb>\\t%<w>0, <sizem1>, ");
}
[(set_attr "type" "branch")
(set (attr "length")
@@ -869,44 +866,44 @@
(const_int 8)))
(set (attr "far_branch")
(if_then_else (and (ge (minus (match_dup 1) (pc))
- (const_int BRANCH_LEN_N_1MiB))
+ (const_int BRANCH_LEN_N_32KiB))
(lt (minus (match_dup 1) (pc))
- (const_int BRANCH_LEN_P_1MiB)))
+ (const_int BRANCH_LEN_P_32KiB)))
(const_string "no")
(const_string "yes")))]
)
;; Emit a `CB<cond> (register)` or `CB<cond> (immediate)` instruction.
;; The immediate range depends on the comparison code.
-;; Comparisons against immediates outside this range fall back to
-;; CMP + B<cond>.
-(define_insn "aarch64_cb<INT_CMP:code><GPI:mode>"
- [(set (pc) (if_then_else (INT_CMP
- (match_operand:GPI 0 "register_operand" "r")
- (match_operand:GPI 1 "nonmemory_operand"
- "r<INT_CMP:cmpbr_imm_constraint>"))
- (label_ref (match_operand 2))
- (pc)))]
- "TARGET_CMPBR && aarch64_cb_rhs (<INT_CMP:CODE>, operands[1])"
+(define_insn "*aarch64_cb<code><mode>"
+ [(set (pc) (if_then_else
+ (INT_CMP
+ (match_operand:GPI 0 "register_operand" "r")
+ (match_operand:GPI 1
+ "aarch64_reg_<cmpbr_imm_constraint>_operand"
+ "r<cmpbr_imm_constraint>"))
+ (label_ref (match_operand 2))
+ (pc)))]
+ "TARGET_CMPBR"
{
- return (get_attr_far_branch (insn) == FAR_BRANCH_NO)
- ? "cb<INT_CMP:cmp_op>\\t%<w>0, %<w>1, %l2"
- : aarch64_gen_far_branch (operands, 2, "L",
- "cb<INT_CMP:inv_cmp_op>\\t%<w>0, %<w>1, ");
+ if (get_attr_length (insn) == 4)
+ return "cb<cmp_op>\t%<w>0, %<w>1, %l2";
+ return aarch64_gen_far_branch (operands, 2, "L",
+ "cb<inv_cmp_op>\t%<w>0, %<w>1, ");
}
[(set_attr "type" "branch")
(set (attr "length")
(if_then_else (and (ge (minus (match_dup 2) (pc))
- (const_int BRANCH_LEN_N_1Kib))
+ (const_int BRANCH_LEN_N_1KiB))
(lt (minus (match_dup 2) (pc))
- (const_int BRANCH_LEN_P_1Kib)))
+ (const_int BRANCH_LEN_P_1KiB)))
(const_int 4)
(const_int 8)))
(set (attr "far_branch")
(if_then_else (and (ge (minus (match_dup 2) (pc))
- (const_int BRANCH_LEN_N_1Kib))
+ (const_int BRANCH_LEN_N_1KiB))
(lt (minus (match_dup 2) (pc))
- (const_int BRANCH_LEN_P_1Kib)))
+ (const_int BRANCH_LEN_P_1KiB)))
(const_string "no")
(const_string "yes")))]
)
@@ -928,16 +925,16 @@
[(set_attr "type" "branch")
(set (attr "length")
(if_then_else (and (ge (minus (match_dup 2) (pc))
- (const_int BRANCH_LEN_N_1Kib))
+ (const_int BRANCH_LEN_N_1KiB))
(lt (minus (match_dup 2) (pc))
- (const_int BRANCH_LEN_P_1Kib)))
+ (const_int BRANCH_LEN_P_1KiB)))
(const_int 4)
(const_int 8)))
(set (attr "far_branch")
(if_then_else (and (ge (minus (match_dup 2) (pc))
- (const_int BRANCH_LEN_N_1Kib))
+ (const_int BRANCH_LEN_N_1KiB))
(lt (minus (match_dup 2) (pc))
- (const_int BRANCH_LEN_P_1Kib)))
+ (const_int BRANCH_LEN_P_1KiB)))
(const_string "no")
(const_string "yes")))]
)
@@ -977,37 +974,24 @@
(const_string "yes")))]
)
-;; For a 24-bit immediate CST we can optimize the compare for equality
-;; and branch sequence from:
-;; mov x0, #imm1
-;; movk x0, #imm2, lsl 16 /* x0 contains CST. */
-;; cmp x1, x0
-;; b<ne,eq> .Label
-;; into the shorter:
-;; sub x0, x1, #(CST & 0xfff000)
-;; subs x0, x0, #(CST & 0x000fff)
-;; b<ne,eq> .Label
+;; For a 24-bit immediate CST we can optimize the compare for equality.
(define_insn_and_split "*aarch64_bcond_wide_imm<GPI:mode>"
- [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
- (match_operand:GPI 1 "aarch64_imm24" "n"))
- (label_ref:P (match_operand 2))
- (pc)))]
- "!aarch64_move_imm (INTVAL (operands[1]), <GPI:MODE>mode)
- && !aarch64_plus_operand (operands[1], <GPI:MODE>mode)
- && !reload_completed"
+ [(set (pc) (if_then_else
+ (match_operator 0 "aarch64_equality_operator"
+ [(match_operand:GPI 1 "register_operand" "r")
+ (match_operand:GPI 2 "aarch64_split_imm24" "n")])
+ (label_ref (match_operand 3))
+ (pc)))
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (match_scratch:GPI 4 "=r"))]
+ ""
"#"
- "&& true"
+ ""
[(const_int 0)]
{
- HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff;
- HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000;
- rtx tmp = gen_reg_rtx (<GPI:MODE>mode);
- emit_insn (gen_add<GPI:mode>3 (tmp, operands[0], GEN_INT (-hi_imm)));
- emit_insn (gen_add<GPI:mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
- rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
- rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <GPI:MODE>mode,
- cc_reg, const0_rtx);
- emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[2]));
+ rtx cc_reg = aarch64_gen_compare_split_imm24 (operands[1], operands[2],
+ operands[4]);
+ emit_jump_insn (gen_aarch64_bcond (operands[0], cc_reg, operands[3]));
DONE;
}
)
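
The deleted comment block spells out the trick this pattern relies on: an equality test against a 24-bit constant can be done as a SUB of the high 12 bits (shifted) followed by a flag-setting SUBS of the low 12 bits, which aarch64_gen_compare_split_imm24 presumably now emits. A minimal C model of that identity (an illustrative sketch; the function name eq_split_imm24 is invented for the example):

    #include <assert.h>
    #include <stdint.h>

    /* x == cst  <=>  ((x - (cst & 0xfff000)) - (cst & 0x000fff)) == 0,
       i.e. SUB #hi12, lsl #12 followed by SUBS #lo12 setting the NZ flags.  */
    static int eq_split_imm24 (uint64_t x, uint32_t cst)
    {
      uint64_t hi = cst & 0xfff000;
      uint64_t lo = cst & 0x000fff;
      return (x - hi) - lo == 0;
    }

    int main (void)
    {
      assert (eq_split_imm24 (0xabc123, 0xabc123));
      assert (!eq_split_imm24 (0xabc124, 0xabc123));
      assert (!eq_split_imm24 (0, 0xabc123));
      return 0;
    }
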
@@ -1412,16 +1396,16 @@
/* Save GCS with code like
mov x16, 1
chkfeat x16
- tbnz x16, 0, .L_done
+ cbnz x16, .L_done
mrs tmp, gcspr_el0
str tmp, [%0, 8]
.L_done: */
- rtx done_label = gen_label_rtx ();
+ auto done_label = gen_label_rtx ();
rtx r16 = gen_rtx_REG (DImode, R16_REGNUM);
emit_move_insn (r16, const1_rtx);
emit_insn (gen_aarch64_chkfeat ());
- emit_insn (gen_tbranch_neqi3 (r16, const0_rtx, done_label));
+ emit_jump_insn (aarch64_gen_compare_zero_and_branch (NE, r16, done_label));
rtx gcs_slot = adjust_address (operands[0], Pmode, GET_MODE_SIZE (Pmode));
rtx gcs = gen_reg_rtx (Pmode);
emit_insn (gen_aarch64_load_gcspr (gcs));
@@ -1444,7 +1428,7 @@
/* Restore GCS with code like
mov x16, 1
chkfeat x16
- tbnz x16, 0, .L_done
+ cbnz x16, .L_done
ldr tmp1, [%1, 8]
mrs tmp2, gcspr_el0
subs tmp2, tmp1, tmp2
@@ -1455,12 +1439,12 @@
b.ne .L_loop
.L_done: */
- rtx loop_label = gen_label_rtx ();
- rtx done_label = gen_label_rtx ();
+ auto loop_label = gen_label_rtx ();
+ auto done_label = gen_label_rtx ();
rtx r16 = gen_rtx_REG (DImode, R16_REGNUM);
emit_move_insn (r16, const1_rtx);
emit_insn (gen_aarch64_chkfeat ());
- emit_insn (gen_tbranch_neqi3 (r16, const0_rtx, done_label));
+ emit_jump_insn (aarch64_gen_compare_zero_and_branch (NE, r16, done_label));
rtx gcs_slot = adjust_address (operands[1], Pmode, GET_MODE_SIZE (Pmode));
rtx gcs_old = gen_reg_rtx (Pmode);
emit_move_insn (gcs_old, gcs_slot);
@@ -4523,7 +4507,7 @@
[(set_attr "type" "fcmp<stype>")]
)
-(define_insn "*cmp_swp_<shift>_reg<mode>"
+(define_insn "cmp_swp_<shift>_reg<mode>"
[(set (reg:CC_SWP CC_REGNUM)
(compare:CC_SWP (ASHIFT:GPI
(match_operand:GPI 0 "register_operand" "r")
@@ -4650,39 +4634,24 @@
[(set_attr "type" "csel")]
)
-;; For a 24-bit immediate CST we can optimize the compare for equality
-;; and branch sequence from:
-;; mov x0, #imm1
-;; movk x0, #imm2, lsl 16 /* x0 contains CST. */
-;; cmp x1, x0
-;; cset x2, <ne,eq>
-;; into the shorter:
-;; sub x0, x1, #(CST & 0xfff000)
-;; subs x0, x0, #(CST & 0x000fff)
-;; cset x2, <ne, eq>.
+;; For a 24-bit immediate CST we can optimize the compare for equality.
(define_insn_and_split "*compare_cstore<mode>_insn"
[(set (match_operand:GPI 0 "register_operand" "=r")
- (EQL:GPI (match_operand:GPI 1 "register_operand" "r")
- (match_operand:GPI 2 "aarch64_imm24" "n")))
- (clobber (reg:CC CC_REGNUM))]
- "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode)
- && !aarch64_plus_operand (operands[2], <MODE>mode)
- && !reload_completed"
+ (match_operator:GPI 1 "aarch64_equality_operator"
+ [(match_operand:GPI 2 "register_operand" "r")
+ (match_operand:GPI 3 "aarch64_split_imm24" "n")]))
+ (clobber (reg:CC CC_REGNUM))
+ (clobber (match_scratch:GPI 4 "=r"))]
+ ""
"#"
- "&& true"
+ ""
[(const_int 0)]
{
- HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff;
- HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000;
- rtx tmp = gen_reg_rtx (<MODE>mode);
- emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm)));
- emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
- rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
- rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx);
- emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg));
+ rtx cc_reg = aarch64_gen_compare_split_imm24 (operands[2], operands[3],
+ operands[4]);
+ emit_insn (gen_aarch64_cstore<mode> (operands[0], operands[1], cc_reg));
DONE;
}
- [(set_attr "type" "csel")]
)
;; zero_extend version of the above
@@ -4812,15 +4781,21 @@
(match_operand:ALLI 3 "register_operand")))]
""
{
- rtx ccreg;
enum rtx_code code = GET_CODE (operands[1]);
-
if (code == UNEQ || code == LTGT)
FAIL;
- ccreg = aarch64_gen_compare_reg (code, XEXP (operands[1], 0),
- XEXP (operands[1], 1));
- operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx);
+ rtx ccreg = XEXP (operands[1], 0);
+ enum machine_mode ccmode = GET_MODE (ccreg);
+ if (GET_MODE_CLASS (ccmode) == MODE_CC)
+ gcc_assert (XEXP (operands[1], 1) == const0_rtx);
+ else if (ccmode == QImode || ccmode == HImode)
+ FAIL;
+ else
+ {
+ ccreg = aarch64_gen_compare_reg (code, ccreg, XEXP (operands[1], 1));
+ operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx);
+ }
}
)
@@ -7715,6 +7690,22 @@
}
)
+(define_expand "isinf<mode>2"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:GPF 1 "register_operand")]
+ "TARGET_FLOAT"
+{
+ rtx op = force_lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode);
+ rtx tmp = gen_reg_rtx (<V_INT_EQUIV>mode);
+ emit_move_insn (tmp, GEN_INT (HOST_WIDE_INT_M1U << (<mantissa_bits> + 1)));
+ rtx cc_reg = gen_rtx_REG (CC_SWPmode, CC_REGNUM);
+ emit_insn (gen_cmp_swp_lsl_reg<v_int_equiv> (op, GEN_INT (1), tmp));
+ rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, const0_rtx);
+ emit_insn (gen_aarch64_cstoresi (operands[0], cmp, cc_reg));
+ DONE;
+}
+)
+
;; -------------------------------------------------------------------
;; Reload support
;; -------------------------------------------------------------------
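
The new isinf<mode>2 expander works on the integer bit pattern: shifting it left by one drops the sign bit, and the result equals ~0 << (mantissa_bits + 1) exactly for +Inf and -Inf, which have a full exponent and a zero mantissa. A stand-alone C model of the same check for double (an assumption-based sketch, not GCC code; isinf_bits is an invented name):

    #include <assert.h>
    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    static int isinf_bits (double x)
    {
      uint64_t bits;
      memcpy (&bits, &x, sizeof bits);              /* reinterpret as integer */
      return (bits << 1) == (~UINT64_C (0) << 53);  /* 52 mantissa bits + 1   */
    }

    int main (void)
    {
      assert (isinf_bits (INFINITY));
      assert (isinf_bits (-INFINITY));
      assert (!isinf_bits (NAN));
      assert (!isinf_bits (1.0));
      return 0;
    }
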
@@ -8565,7 +8556,7 @@
[(set (match_operand:DI 0 "register_operand" "=rk")
(ior:DI
(and:DI (match_operand:DI 1 "register_operand" "rk")
- (const_int -1080863910568919041)) ;; 0xf0ff...
+ (const_int MEMTAG_TAG_MASK))
(ashift:DI (unspec:QI [(match_operand:DI 2 "register_operand" "r")]
UNSPEC_GEN_TAG_RND)
(const_int 56))))]
@@ -8608,9 +8599,9 @@
[(set (match_operand:DI 0 "register_operand" "=r")
(minus:DI
(and:DI (match_operand:DI 1 "register_operand" "rk")
- (const_int 72057594037927935)) ;; 0x00ff...
+ (const_int MEMTAG_ADDR_MASK))
(and:DI (match_operand:DI 2 "register_operand" "rk")
- (const_int 72057594037927935))))] ;; 0x00ff...
+ (const_int MEMTAG_ADDR_MASK))))]
"TARGET_MEMTAG"
"subp\\t%0, %1, %2"
[(set_attr "type" "memtag")]
@@ -8620,7 +8611,7 @@
(define_insn "ldg"
[(set (match_operand:DI 0 "register_operand" "+r")
(ior:DI
- (and:DI (match_dup 0) (const_int -1080863910568919041)) ;; 0xf0ff...
+ (and:DI (match_dup 0) (const_int MEMTAG_TAG_MASK))
(ashift:DI
(mem:QI (unspec:DI
[(and:DI (plus:DI (match_operand:DI 1 "register_operand" "rk")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index dc1925d..7b9e558 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -312,15 +312,9 @@
(define_constraint "Uc1"
"@internal
- A constraint that matches the integers 1...64."
+ A constraint that matches the integers 0...62."
(and (match_code "const_int")
- (match_test "IN_RANGE (ival, 1, 64)")))
-
-(define_constraint "Uc2"
- "@internal
- A constraint that matches the integers -1...62."
- (and (match_code "const_int")
- (match_test "IN_RANGE (ival, -1, 62)")))
+ (match_test "IN_RANGE (ival, 0, 62)")))
(define_constraint "Up3"
"@internal
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 8533912..b15e578 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -455,6 +455,7 @@
(define_mode_iterator VCVTFPM [V4HF V8HF V4SF])
;; Iterators for single modes, for "@" patterns.
+(define_mode_iterator VNx16BI_ONLY [VNx16BI])
(define_mode_iterator VNx16QI_ONLY [VNx16QI])
(define_mode_iterator VNx16SI_ONLY [VNx16SI])
(define_mode_iterator VNx8HI_ONLY [VNx8HI])
@@ -542,6 +543,12 @@
;; elements.
(define_mode_iterator SVE_FULL_HSF [VNx8HF VNx4SF])
+;; Like SVE_FULL_HSF, but selectively enables those modes that are valid
+;; for the variant of the SVE2 FP8 FDOT instruction associated with that
+;; mode.
+(define_mode_iterator SVE_FULL_HSF_FP8_FDOT [(VNx4SF "TARGET_SSVE_FP8DOT4")
+ (VNx8HF "TARGET_SSVE_FP8DOT2")])
+
;; Partial SVE floating-point vector modes that have 16-bit or 32-bit
;; elements.
(define_mode_iterator SVE_PARTIAL_HSF [VNx2HF VNx4HF VNx2SF])
@@ -930,7 +937,6 @@
UNSPEC_UZP2Q ; Used in aarch64-sve.md.
UNSPEC_ZIP1Q ; Used in aarch64-sve.md.
UNSPEC_ZIP2Q ; Used in aarch64-sve.md.
- UNSPEC_TRN1_CONV ; Used in aarch64-sve.md.
UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md.
UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md.
UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md.
@@ -1185,6 +1191,9 @@
UNSPEC_LUTI2 ; Used in aarch64-simd.md.
UNSPEC_LUTI4 ; Used in aarch64-simd.md.
+ ;; All used in aarch64-sve.md
+ UNSPEC_PERMUTE_PRED
+
;; All used in aarch64-sve2.md
UNSPEC_ADDQV
UNSPEC_ANDQV
@@ -1331,6 +1340,8 @@
(define_mode_attr half_mask [(HI "255") (SI "65535") (DI "4294967295")])
+(define_mode_attr mantissa_bits [(SF "23") (DF "52")])
+
;; For constraints used in scalar immediate vector moves
(define_mode_attr hq [(HI "h") (QI "q")])
@@ -2977,19 +2988,15 @@
(define_code_iterator INT_CMP [lt le eq ne ge gt ltu leu geu gtu])
+;; Inverse comparisons must have the same constraint so that
+;; branches can be redirected during late compilation.
(define_code_attr cmpbr_imm_constraint [
- (eq "Uc0")
- (ne "Uc0")
- (gt "Uc0")
- (gtu "Uc0")
- (lt "Uc0")
- (ltu "Uc0")
-
- (ge "Uc1")
- (geu "Uc1")
-
- (le "Uc2")
- (leu "Uc2")
+ (eq "Uc0") (ne "Uc0")
+ (lt "Uc0") (ge "Uc0")
+ (ltu "Uc0") (geu "Uc0")
+
+ (gt "Uc1") (le "Uc1")
+ (gtu "Uc1") (leu "Uc1")
])
(define_code_attr fix_trunc_optab [(fix "fix_trunc")
@@ -3877,6 +3884,8 @@
(define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
+(define_int_iterator PNEXT_ONLY [UNSPEC_PNEXT])
+
(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL
UNSPEC_USMATMUL])
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 32056da..42304ce 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -286,10 +286,15 @@
(and (match_code "const_int")
(match_test "UINTVAL (op) <= 7")))
-;; An immediate that fits into 24 bits.
-(define_predicate "aarch64_imm24"
- (and (match_code "const_int")
- (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)")))
+;; An immediate that fits into 24 bits, but needs splitting.
+(define_predicate "aarch64_split_imm24"
+ (match_code "const_int")
+{
+ unsigned HOST_WIDE_INT i = UINTVAL (op);
+ return (IN_RANGE (i, 0, 0xffffff)
+ && !aarch64_move_imm (i, mode)
+ && !aarch64_uimm12_shift (i));
+})
(define_predicate "aarch64_mem_pair_offset"
(and (match_code "const_int")
@@ -1078,3 +1083,19 @@
(define_predicate "aarch64_maskload_else_operand"
(and (match_code "const_vector")
(match_test "op == CONST0_RTX (GET_MODE (op))")))
+
+;; Check for a VNx16BI predicate that is a canonical PTRUE for the given
+;; predicate mode.
+(define_special_predicate "aarch64_ptrue_all_operand"
+ (and (match_code "const_vector")
+ (match_test "aarch64_ptrue_all_mode (op) == mode")))
+
+(define_predicate "aarch64_reg_Uc0_operand"
+ (ior (match_operand 0 "register_operand")
+ (and (match_code "const_int")
+ (match_test "satisfies_constraint_Uc0 (op)"))))
+
+(define_predicate "aarch64_reg_Uc1_operand"
+ (ior (match_operand 0 "register_operand")
+ (and (match_code "const_int")
+ (match_test "satisfies_constraint_Uc1 (op)"))))
diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
index 38a8c06..63ca8e9 100644
--- a/gcc/config/aarch64/t-aarch64
+++ b/gcc/config/aarch64/t-aarch64
@@ -190,12 +190,6 @@ aarch-bti-insert.o: $(srcdir)/config/arm/aarch-bti-insert.cc \
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/arm/aarch-bti-insert.cc
-aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \
- $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \
- $(RTL_SSA_H) tree-pass.h
- $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
- $(srcdir)/config/aarch64/aarch64-cc-fusion.cc
-
aarch64-early-ra.o: $(srcdir)/config/aarch64/aarch64-early-ra.cc \
$(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \
$(RTL_SSA_H) tree-pass.h
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index f76a250..9eb1a20 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -26,7 +26,7 @@
static const struct cpu_addrcost_table generic_armv9_a_addrcost_table =
{
{
- 1, /* hi */
+ 0, /* hi */
0, /* si */
0, /* di */
1, /* ti */
diff --git a/gcc/config/aarch64/tuning_models/olympus.h b/gcc/config/aarch64/tuning_models/olympus.h
new file mode 100644
index 0000000..268789d
--- /dev/null
+++ b/gcc/config/aarch64/tuning_models/olympus.h
@@ -0,0 +1,210 @@
+/* Tuning model description for the NVIDIA Olympus core.
+ Copyright The GNU Toolchain Authors.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_AARCH64_H_OLYMPUS
+#define GCC_AARCH64_H_OLYMPUS
+
+#include "generic.h"
+
+static struct cpu_regmove_cost olympus_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Spilling to int<->fp instead of memory is recommended so set
+ realistic costs compared to memmov_cost. */
+ 3, /* GP2FP */
+ 3, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static advsimd_vec_cost olympus_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 2, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 5, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 4, /* reduc_f16_cost */
+ 4, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 8, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 6, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static sve_vec_cost olympus_sve_vector_cost =
+{
+ {
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 2, /* ld2_st2_permute_cost */
+ 3, /* ld3_st3_permute_cost */
+ 3, /* ld4_st4_permute_cost */
+ 2, /* permute_cost */
+ 9, /* reduc_i8_cost */
+ 8, /* reduc_i16_cost */
+ 6, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 8, /* reduc_f16_cost */
+ 6, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
+ 2, /* store_elt_extra_cost */
+ 8, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 3, /* clast_cost */
+ 10, /* fadda_f16_cost */
+ 6, /* fadda_f32_cost */
+ 4, /* fadda_f64_cost */
+ 14, /* gather_load_x32_cost */
+ 12, /* gather_load_x64_cost */
+ 42, /* gather_load_x32_init_cost */
+ 24, /* gather_load_x64_init_cost */
+ 1 /* scatter_store_elt_cost */
+};
+
+static aarch64_scalar_vec_issue_info olympus_scalar_issue_info =
+{
+ 4, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 8, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+};
+
+static aarch64_advsimd_vec_issue_info olympus_advsimd_issue_info =
+{
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 6, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+};
+
+static aarch64_sve_vec_issue_info olympus_sve_issue_info =
+{
+ {
+ {
+ 3, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 6, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 2, /* ld2_st2_general_ops */
+ 2, /* ld3_st3_general_ops */
+ 3 /* ld4_st4_general_ops */
+ },
+ 2, /* pred_ops_per_cycle */
+ 1, /* while_pred_ops */
+ 0, /* int_cmp_pred_ops */
+ 0, /* fp_cmp_pred_ops */
+ 1, /* gather_scatter_pair_general_ops */
+ 1 /* gather_scatter_pair_pred_ops */
+};
+
+static aarch64_vec_issue_info olympus_vec_issue_info =
+{
+ &olympus_scalar_issue_info,
+ &olympus_advsimd_issue_info,
+ &olympus_sve_issue_info
+};
+
+/* Olympus costs for vector insn classes. */
+static struct cpu_vector_cost olympus_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 2, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &olympus_advsimd_vector_cost, /* advsimd */
+ &olympus_sve_vector_cost, /* sve */
+ &olympus_vec_issue_info /* issue_info */
+};
+
+/* Olympus prefetch settings (which disable prefetch). */
+static cpu_prefetch_tune olympus_prefetch_tune =
+{
+ 0, /* num_slots */
+ -1, /* l1_cache_size */
+ 64, /* l1_cache_line_size */
+ -1, /* l2_cache_size */
+ true, /* prefetch_dynamic_strides */
+ -1, /* minimum_stride */
+ -1 /* default_opt_level */
+};
+
+static struct tune_params olympus_tunings =
+{
+ &cortexa76_extra_costs,
+ &generic_armv9_a_addrcost_table,
+ &olympus_regmove_cost,
+ &olympus_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_128, /* sve_width */
+ { 4, /* load_int. */
+ 1, /* store_int. */
+ 6, /* load_fp. */
+ 3, /* store_fp. */
+ 5, /* load_pred. */
+ 1 /* store_pred. */
+ }, /* memmov_cost. */
+ 10, /* issue_rate */
+ AARCH64_FUSE_NEOVERSE_BASE, /* fusible_ops */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 8, /* int_reassoc_width. */
+ 6, /* fp_reassoc_width. */
+ 4, /* fma_reassoc_width. */
+ 6, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_BASE
+ | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+ | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
+ &olympus_prefetch_tune,
+ AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
+ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */
+};
+
+#endif /* GCC_AARCH64_H_OLYMPUS. */
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index d119464..8f7e537 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -66,9 +66,9 @@
;; I signed 12-bit immediate (for ARCompact)
;; K unsigned 3-bit immediate (for ARCompact)
;; L unsigned 6-bit immediate (for ARCompact)
-;; M unsinged 5-bit immediate (for ARCompact)
-;; O unsinged 7-bit immediate (for ARCompact)
-;; P unsinged 8-bit immediate (for ARCompact)
+;; M unsigned 5-bit immediate (for ARCompact)
+;; O unsigned 7-bit immediate (for ARCompact)
+;; P unsigned 8-bit immediate (for ARCompact)
;; N constant '1' (for ARCompact)
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 29b45ae..8b951f3 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -18983,7 +18983,8 @@ cmse_nonsecure_call_inline_register_clear (void)
call = SET_SRC (call);
/* Check if it is a cmse_nonsecure_call. */
- unspec = XEXP (call, 0);
+ unspec = XVECEXP (pat, 0, 2);
+
if (GET_CODE (unspec) != UNSPEC
|| XINT (unspec, 1) != UNSPEC_NONSECURE_MEM)
continue;
@@ -19010,7 +19011,7 @@ cmse_nonsecure_call_inline_register_clear (void)
/* Make sure the register used to hold the function address is not
cleared. */
- address = RTVEC_ELT (XVEC (unspec, 0), 0);
+ address = XEXP (call, 0);
gcc_assert (MEM_P (address));
gcc_assert (REG_P (XEXP (address, 0)));
address_regnum = REGNO (XEXP (address, 0));
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 5e5e112..422ae54 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -8623,7 +8623,7 @@
if (detect_cmse_nonsecure_call (addr))
{
pat = gen_nonsecure_call_internal (operands[0], operands[1],
- operands[2]);
+ operands[2], const0_rtx);
emit_call_insn (pat);
}
else
@@ -8665,10 +8665,10 @@
(clobber (reg:SI LR_REGNUM))])])
(define_expand "nonsecure_call_internal"
- [(parallel [(call (unspec:SI [(match_operand 0 "memory_operand")]
- UNSPEC_NONSECURE_MEM)
+ [(parallel [(call (match_operand 0 "memory_operand")
(match_operand 1 "general_operand"))
(use (match_operand 2 "" ""))
+ (unspec:SI [(match_operand 3)] UNSPEC_NONSECURE_MEM)
(clobber (reg:SI LR_REGNUM))])]
"use_cmse"
{
@@ -8745,7 +8745,8 @@
if (detect_cmse_nonsecure_call (addr))
{
pat = gen_nonsecure_call_value_internal (operands[0], operands[1],
- operands[2], operands[3]);
+ operands[2], operands[3],
+ const0_rtx);
emit_call_insn (pat);
}
else
@@ -8779,10 +8780,10 @@
(define_expand "nonsecure_call_value_internal"
[(parallel [(set (match_operand 0 "" "")
- (call (unspec:SI [(match_operand 1 "memory_operand")]
- UNSPEC_NONSECURE_MEM)
+ (call (match_operand 1 "memory_operand")
(match_operand 2 "general_operand")))
(use (match_operand 3 "" ""))
+ (unspec:SI [(match_operand 4)] UNSPEC_NONSECURE_MEM)
(clobber (reg:SI LR_REGNUM))])]
"use_cmse"
"
@@ -13025,7 +13026,7 @@
"arm_coproc_builtin_available (VUNSPEC_<MCRR>)"
{
arm_const_bounds (operands[0], 0, 16);
- arm_const_bounds (operands[1], 0, 8);
+ arm_const_bounds (operands[1], 0, 16);
arm_const_bounds (operands[3], 0, (1 << 5));
return "<mcrr>\\tp%c0, %1, %Q2, %R2, CR%c3";
}
@@ -13040,7 +13041,7 @@
"arm_coproc_builtin_available (VUNSPEC_<MRRC>)"
{
arm_const_bounds (operands[1], 0, 16);
- arm_const_bounds (operands[2], 0, 8);
+ arm_const_bounds (operands[2], 0, 16);
arm_const_bounds (operands[3], 0, (1 << 5));
return "<mrrc>\\tp%c1, %2, %Q0, %R0, CR%c3";
}
diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
index f9e89e9..4da0086 100644
--- a/gcc/config/arm/thumb1.md
+++ b/gcc/config/arm/thumb1.md
@@ -1874,10 +1874,10 @@
)
(define_insn "*nonsecure_call_reg_thumb1_v5"
- [(call (unspec:SI [(mem:SI (reg:SI R4_REGNUM))]
- UNSPEC_NONSECURE_MEM)
+ [(call (mem:SI (reg:SI R4_REGNUM))
(match_operand 0 "" ""))
(use (match_operand 1 "" ""))
+   (unspec:SI [(match_operand 2)] UNSPEC_NONSECURE_MEM)
(clobber (reg:SI LR_REGNUM))]
"TARGET_THUMB1 && use_cmse && !SIBLING_CALL_P (insn)"
"bl\\t__gnu_cmse_nonsecure_call"
@@ -1919,11 +1919,10 @@
(define_insn "*nonsecure_call_value_reg_thumb1_v5"
[(set (match_operand 0 "" "")
- (call (unspec:SI
- [(mem:SI (reg:SI R4_REGNUM))]
- UNSPEC_NONSECURE_MEM)
+ (call (mem:SI (reg:SI R4_REGNUM))
(match_operand 1 "" "")))
(use (match_operand 2 "" ""))
+ (unspec:SI [(match_operand 3)] UNSPEC_NONSECURE_MEM)
(clobber (reg:SI LR_REGNUM))]
"TARGET_THUMB1 && use_cmse"
"bl\\t__gnu_cmse_nonsecure_call"
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index 019f9d4..2c2026b 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -537,10 +537,10 @@
)
(define_insn "*nonsecure_call_reg_thumb2_fpcxt"
- [(call (unspec:SI [(mem:SI (match_operand:SI 0 "s_register_operand" "l*r"))]
- UNSPEC_NONSECURE_MEM)
+ [(call (mem:SI (match_operand:SI 0 "s_register_operand" "l*r"))
(match_operand 1 "" ""))
(use (match_operand 2 "" ""))
+ (unspec:SI [(match_operand 3)] UNSPEC_NONSECURE_MEM)
(clobber (reg:SI LR_REGNUM))]
"TARGET_THUMB2 && use_cmse && TARGET_HAVE_FPCXT_CMSE"
"blxns\\t%0"
@@ -549,10 +549,10 @@
)
(define_insn "*nonsecure_call_reg_thumb2"
- [(call (unspec:SI [(mem:SI (reg:SI R4_REGNUM))]
- UNSPEC_NONSECURE_MEM)
+ [(call (mem:SI (reg:SI R4_REGNUM))
(match_operand 0 "" ""))
(use (match_operand 1 "" ""))
+ (unspec:SI [(match_operand 2)] UNSPEC_NONSECURE_MEM)
(clobber (reg:SI LR_REGNUM))]
"TARGET_THUMB2 && use_cmse && !TARGET_HAVE_FPCXT_CMSE"
"bl\\t__gnu_cmse_nonsecure_call"
@@ -573,11 +573,10 @@
(define_insn "*nonsecure_call_value_reg_thumb2_fpcxt"
[(set (match_operand 0 "" "")
- (call
- (unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))]
- UNSPEC_NONSECURE_MEM)
- (match_operand 2 "" "")))
+ (call (mem:SI (match_operand:SI 1 "register_operand" "l*r"))
+ (match_operand 2 "" "")))
(use (match_operand 3 "" ""))
+ (unspec:SI [(match_operand 4)] UNSPEC_NONSECURE_MEM)
(clobber (reg:SI LR_REGNUM))]
"TARGET_THUMB2 && use_cmse && TARGET_HAVE_FPCXT_CMSE"
"blxns\\t%1"
@@ -587,10 +586,10 @@
(define_insn "*nonsecure_call_value_reg_thumb2"
[(set (match_operand 0 "" "")
- (call
- (unspec:SI [(mem:SI (reg:SI R4_REGNUM))] UNSPEC_NONSECURE_MEM)
- (match_operand 1 "" "")))
+ (call (mem:SI (reg:SI R4_REGNUM))
+ (match_operand 1 "" "")))
(use (match_operand 2 "" ""))
+ (unspec:SI [(match_operand 3)] UNSPEC_NONSECURE_MEM)
(clobber (reg:SI LR_REGNUM))]
"TARGET_THUMB2 && use_cmse && !TARGET_HAVE_FPCXT_CMSE"
"bl\\t__gnu_cmse_nonsecure_call"
diff --git a/gcc/config/avr/avr-dimode.md b/gcc/config/avr/avr-dimode.md
index 903bfbf..66ba5a9 100644
--- a/gcc/config/avr/avr-dimode.md
+++ b/gcc/config/avr/avr-dimode.md
@@ -101,10 +101,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL8 ACC_A)
- (plus:ALL8 (reg:ALL8 ACC_A)
- (reg:ALL8 ACC_B)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*add<mode>3_insn"
[(set (reg:ALL8 ACC_A)
@@ -122,10 +120,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:DI ACC_A)
- (plus:DI (reg:DI ACC_A)
- (sign_extend:DI (reg:QI REG_X))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*adddi3_const8_insn"
[(set (reg:DI ACC_A)
@@ -146,12 +142,10 @@
(match_operand:ALL8 0 "const_operand" "n Ynn")))]
"avr_have_dimode
&& !s8_operand (operands[0], VOIDmode)"
- "#"
- "&& reload_completed"
- [(parallel [(set (reg:ALL8 ACC_A)
- (plus:ALL8 (reg:ALL8 ACC_A)
- (match_dup 0)))
- (clobber (reg:CC REG_CC))])])
+ "#"
+ "&& reload_completed"
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*add<mode>3_const_insn"
[(set (reg:ALL8 ACC_A)
@@ -211,10 +205,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL8 ACC_A)
- (minus:ALL8 (reg:ALL8 ACC_A)
- (reg:ALL8 ACC_B)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sub<mode>3_insn"
[(set (reg:ALL8 ACC_A)
@@ -236,10 +228,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL8 ACC_A)
- (minus:ALL8 (reg:ALL8 ACC_A)
- (match_dup 0)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sub<mode>3_const_insn"
[(set (reg:ALL8 ACC_A)
@@ -288,10 +278,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL8S ACC_A)
- (ss_addsub:ALL8S (reg:ALL8S ACC_A)
- (reg:ALL8S ACC_B)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>3_insn"
[(set (reg:ALL8S ACC_A)
@@ -309,10 +297,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL8S ACC_A)
- (ss_addsub:ALL8S (reg:ALL8S ACC_A)
- (match_dup 0)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>3_const_insn"
[(set (reg:ALL8S ACC_A)
@@ -361,10 +347,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL8U ACC_A)
- (us_addsub:ALL8U (reg:ALL8U ACC_A)
- (reg:ALL8U ACC_B)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>3_insn"
[(set (reg:ALL8U ACC_A)
@@ -382,10 +366,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL8U ACC_A)
- (us_addsub:ALL8U (reg:ALL8U ACC_A)
- (match_operand:ALL8U 0 "const_operand" "n Ynn")))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>3_const_insn"
[(set (reg:ALL8U ACC_A)
@@ -421,9 +403,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:DI ACC_A)
- (neg:DI (reg:DI ACC_A)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*negdi2_insn"
[(set (reg:DI ACC_A)
@@ -500,7 +481,7 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(const_int 0)]
+ [(scratch)]
{
emit_insn (gen_compare_<mode>2 ());
emit_jump_insn (gen_conditional_jump (operands[0], operands[1]));
@@ -529,7 +510,7 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(const_int 0)]
+ [(scratch)]
{
emit_insn (gen_compare_const8_di2 ());
emit_jump_insn (gen_conditional_jump (operands[0], operands[1]));
@@ -556,7 +537,7 @@
&& !s8_operand (operands[1], VOIDmode)"
"#"
"&& reload_completed"
- [(const_int 0)]
+ [(scratch)]
{
emit_insn (gen_compare_const_<mode>2 (operands[1], operands[3]));
emit_jump_insn (gen_conditional_jump (operands[0], operands[2]));
@@ -629,10 +610,8 @@
"avr_have_dimode"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL8 ACC_A)
- (di_shifts:ALL8 (reg:ALL8 ACC_A)
- (reg:QI 16)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>3_insn"
[(set (reg:ALL8 ACC_A)
@@ -674,14 +653,10 @@
(clobber (reg:HI REG_Z))]
"avr_have_dimode
&& AVR_HAVE_MUL"
- "#"
- "&& reload_completed"
- [(parallel [(set (reg:DI ACC_A)
- (mult:DI (any_extend:DI (reg:SI 18))
- (any_extend:DI (reg:SI 22))))
- (clobber (reg:HI REG_X))
- (clobber (reg:HI REG_Z))
- (clobber (reg:CC REG_CC))])])
+ "#"
+ "&& reload_completed"
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<extend_u>mulsidi3_insn"
[(set (reg:DI ACC_A)
diff --git a/gcc/config/avr/avr-fixed.md b/gcc/config/avr/avr-fixed.md
index ce46beb..22061fc 100644
--- a/gcc/config/avr/avr-fixed.md
+++ b/gcc/config/avr/avr-fixed.md
@@ -62,10 +62,8 @@
"<FIXED_B:MODE>mode != <FIXED_A:MODE>mode"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (fract_convert:FIXED_A
- (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fract<FIXED_B:mode><FIXED_A:mode>2"
[(set (match_operand:FIXED_A 0 "register_operand" "=r")
@@ -86,10 +84,8 @@
"<FIXED_B:MODE>mode != <FIXED_A:MODE>mode"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (unsigned_fract_convert:FIXED_A
- (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fractuns<FIXED_B:mode><FIXED_A:mode>2"
[(set (match_operand:FIXED_A 0 "register_operand" "=r")
@@ -124,10 +120,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ss_addsub:ALL124S (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>3"
[(set (match_operand:ALL124S 0 "register_operand" "=??d,d")
@@ -149,10 +143,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (us_addsub:ALL124U (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>3"
[(set (match_operand:ALL124U 0 "register_operand" "=??r,d")
@@ -189,9 +181,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ss_neg:QQ (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ssnegqq2"
[(set (match_operand:QQ 0 "register_operand" "=r")
@@ -207,9 +198,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ss_abs:QQ (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ssabsqq2"
[(set (match_operand:QQ 0 "register_operand" "=r")
@@ -241,9 +231,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL2S 24)
- (ss_abs_neg:ALL2S (reg:ALL2S 24)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>2"
[(set (reg:ALL2S 24)
@@ -261,9 +250,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL4S 22)
- (ss_abs_neg:ALL4S (reg:ALL4S 22)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code_stdname><mode>2"
[(set (reg:ALL4S 22)
@@ -296,10 +284,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:QQ (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulqq3_enh"
[(set (match_operand:QQ 0 "register_operand" "=r")
@@ -317,10 +303,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:UQQ (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*muluqq3_enh"
[(set (match_operand:UQQ 0 "register_operand" "=r")
@@ -377,12 +361,8 @@
"!AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:QQ 23)
- (mult:QQ (reg:QQ 24)
- (reg:QQ 25)))
- (clobber (reg:QI 22))
- (clobber (reg:HI 24))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulqq3.call"
[(set (reg:QQ 23)
@@ -425,11 +405,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL2QA 24)
- (mult:ALL2QA (reg:ALL2QA 18)
- (reg:ALL2QA 26)))
- (clobber (reg:HI 22))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mul<mode>3.call"
[(set (reg:ALL2QA 24)
@@ -468,10 +445,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL4A 24)
- (mult:ALL4A (reg:ALL4A 16)
- (reg:ALL4A 20)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mul<mode>3.call"
[(set (reg:ALL4A 24)
@@ -514,11 +489,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL1Q 24)
- (usdiv:ALL1Q (reg:ALL1Q 25)
- (reg:ALL1Q 22)))
- (clobber (reg:QI 25))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code><mode>3.call"
[(set (reg:ALL1Q 24)
@@ -560,12 +532,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL2QA 24)
- (usdiv:ALL2QA (reg:ALL2QA 26)
- (reg:ALL2QA 22)))
- (clobber (reg:HI 26))
- (clobber (reg:QI 21))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code><mode>3.call"
[(set (reg:ALL2QA 24)
@@ -608,12 +576,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL4A 22)
- (usdiv:ALL4A (reg:ALL4A 24)
- (reg:ALL4A 18)))
- (clobber (reg:HI 26))
- (clobber (reg:HI 30))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<code><mode>3.call"
[(set (reg:ALL4A 22)
@@ -684,12 +648,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (unspec:ALL124QA [(match_dup 1)
- (match_dup 2)
- (const_int 0)]
- UNSPEC_ROUND))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*round<mode>3_const"
[(set (match_operand:ALL124QA 0 "register_operand" "=d")
@@ -714,11 +674,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL1Q 24)
- (unspec:ALL1Q [(reg:ALL1Q 22)
- (reg:QI 24)] UNSPEC_ROUND))
- (clobber (reg:ALL1Q 22))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*round<mode>3.libgcc"
[(set (reg:ALL1Q 24)
@@ -740,11 +697,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL2QA 24)
- (unspec:ALL2QA [(reg:ALL2QA 22)
- (reg:QI 24)] UNSPEC_ROUND))
- (clobber (reg:ALL2QA 22))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*round<mode>3.libgcc"
[(set (reg:ALL2QA 24)
@@ -766,11 +720,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:ALL4QA 22)
- (unspec:ALL4QA [(reg:ALL4QA 18)
- (reg:QI 24)] UNSPEC_ROUND))
- (clobber (reg:ALL4QA 18))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*round<mode>3.libgcc"
[(set (reg:ALL4QA 22)
diff --git a/gcc/config/avr/avr-log.cc b/gcc/config/avr/avr-log.cc
index fadb3ca..972ba6b 100644
--- a/gcc/config/avr/avr-log.cc
+++ b/gcc/config/avr/avr-log.cc
@@ -373,7 +373,6 @@ avr_log_set_avr_log (void)
SET_DUMP_DETAIL (insn_addresses);
SET_DUMP_DETAIL (legitimate_address_p);
SET_DUMP_DETAIL (legitimize_address);
- SET_DUMP_DETAIL (legitimize_reload_address);
SET_DUMP_DETAIL (progmem);
SET_DUMP_DETAIL (rtx_costs);
diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc
index 6a88a27..69df6d2 100644
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -4843,6 +4843,137 @@ avr_pass_fuse_add::execute1 (function *func)
//////////////////////////////////////////////////////////////////////////////
+// Fuse 2 move insns after combine.
+
+static const pass_data avr_pass_data_2moves =
+{
+ RTL_PASS, // type
+ "", // name (will be patched)
+ OPTGROUP_NONE, // optinfo_flags
+ TV_DF_SCAN, // tv_id
+ 0, // properties_required
+ 0, // properties_provided
+ 0, // properties_destroyed
+ 0, // todo_flags_start
+ 0 // todo_flags_finish
+};
+
+class avr_pass_2moves : public rtl_opt_pass
+{
+public:
+ avr_pass_2moves (gcc::context *ctxt, const char *name)
+ : rtl_opt_pass (avr_pass_data_2moves, ctxt)
+ {
+ this->name = name;
+ }
+
+ unsigned int execute (function *func) final override
+ {
+ if (optimize && avropt_fuse_move2)
+ {
+ bool changed = false;
+ basic_block bb;
+
+ FOR_EACH_BB_FN (bb, func)
+ {
+ changed |= optimize_2moves_bb (bb);
+ }
+
+ if (changed)
+ {
+ df_note_add_problem ();
+ df_analyze ();
+ }
+ }
+
+ return 0;
+ }
+
+ bool optimize_2moves (rtx_insn *, rtx_insn *);
+ bool optimize_2moves_bb (basic_block);
+}; // avr_pass_2moves
+
+bool
+avr_pass_2moves::optimize_2moves_bb (basic_block bb)
+{
+ bool changed = false;
+ rtx_insn *insn1 = nullptr;
+ rtx_insn *insn2 = nullptr;
+ rtx_insn *curr;
+
+ FOR_BB_INSNS (bb, curr)
+ {
+ if (insn1 && INSN_P (insn1)
+ && insn2 && INSN_P (insn2))
+ changed |= optimize_2moves (insn1, insn2);
+
+ insn1 = insn2;
+ insn2 = curr;
+ }
+
+ return changed;
+}
+
+bool
+avr_pass_2moves::optimize_2moves (rtx_insn *insn1, rtx_insn *insn2)
+{
+ bool good = false;
+ bool bad = false;
+ rtx set1, dest1, src1;
+ rtx set2, dest2, src2;
+
+ if ((set1 = single_set (insn1))
+ && (set2 = single_set (insn2))
+ && (src1 = SET_SRC (set1))
+ && REG_P (src2 = SET_SRC (set2))
+ && REG_P (dest1 = SET_DEST (set1))
+ && REG_P (dest2 = SET_DEST (set2))
+ && rtx_equal_p (dest1, src2)
+ // Now we have:
+ // insn1: dest1 = src1
+ // insn2: dest2 = dest1
+ && REGNO (dest1) >= FIRST_PSEUDO_REGISTER
+ // Paranoia.
+ && GET_CODE (PATTERN (insn1)) != PARALLEL
+ && GET_CODE (PATTERN (insn2)) != PARALLEL
+ && (rtx_equal_p (dest2, src1)
+ || !reg_overlap_mentioned_p (dest2, src1)))
+ {
+ avr_dump ("\n;; Found 2moves:\n%r\n%r\n", insn1, insn2);
+ avr_dump (";; reg %d: insn uses uids:", REGNO (dest1));
+
+ // Go check that dest1 is used exactly once, namely by insn2.
+
+ df_ref use = DF_REG_USE_CHAIN (REGNO (dest1));
+ for (; use; use = DF_REF_NEXT_REG (use))
+ {
+ rtx_insn *user = DF_REF_INSN (use);
+ avr_dump (" %d", INSN_UID (user));
+ good |= INSN_UID (user) == INSN_UID (insn2);
+ bad |= INSN_UID (user) != INSN_UID (insn2);
+ }
+ avr_dump (".\n");
+
+ if (good && !bad
+ // Propagate src1 to insn2:
+ // insn1: # Deleted
+ // insn2: dest2 = src1
+ && validate_change (insn2, &SET_SRC (set2), src1, false))
+ {
+ SET_INSN_DELETED (insn1);
+ return true;
+ }
+ }
+
+ if (good && !bad)
+ avr_dump (";; Failed\n");
+
+ return false;
+}
+
+
+
+//////////////////////////////////////////////////////////////////////////////
// Split insns with nonzero_bits() after combine.
static const pass_data avr_pass_data_split_nzb =
@@ -5704,6 +5835,14 @@ make_avr_pass_casesi (gcc::context *ctxt)
return new avr_pass_casesi (ctxt, "avr-casesi");
}
+// Optimize 2 consecutive moves after combine.
+
+rtl_opt_pass *
+make_avr_pass_2moves (gcc::context *ctxt)
+{
+ return new avr_pass_2moves (ctxt, "avr-2moves");
+}
+
rtl_opt_pass *
make_avr_pass_split_nzb (gcc::context *ctxt)
{
diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def
index eb60a93..d668c7f 100644
--- a/gcc/config/avr/avr-passes.def
+++ b/gcc/config/avr/avr-passes.def
@@ -74,6 +74,14 @@ INSERT_PASS_BEFORE (pass_free_cfg, 1, avr_pass_recompute_notes);
INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi);
+/* Insn combine may come up with superfluous reg-reg moves, where the combine
+ people say that these are no problem since reg-alloc is supposed to optimize
+ them. The issue is that the lower-subreg pass sitting between combine and
+ reg-alloc may split such moves, coming up with a zoo of subregs which are
+ only handled poorly by the register allocator. */
+
+INSERT_PASS_AFTER (pass_combine, 1, avr_pass_2moves);
+
/* Some combine insns have nonzero_bits() in their condition, though insns
should not use such stuff in their condition. Therefore, we split such
insn into something without nonzero_bits() in their condition right after
diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index ca30136..8ba1945 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -146,7 +146,6 @@ extern void out_shift_with_cnt (const char *templ, rtx_insn *insn,
extern enum reg_class avr_mode_code_base_reg_class (machine_mode, addr_space_t, rtx_code, rtx_code);
extern bool avr_regno_mode_code_ok_for_base_p (int, machine_mode, addr_space_t, rtx_code, rtx_code);
extern rtx avr_incoming_return_addr_rtx (void);
-extern rtx avr_legitimize_reload_address (rtx*, machine_mode, int, int, int, int, rtx (*)(rtx,int));
extern bool avr_adiw_reg_p (rtx);
extern bool avr_mem_flash_p (rtx);
extern bool avr_mem_flashx_p (rtx);
@@ -168,6 +167,8 @@ regmask (machine_mode mode, unsigned regno)
extern void avr_fix_inputs (rtx*, unsigned, unsigned);
extern bool avr_emit3_fix_outputs (rtx (*)(rtx,rtx,rtx), rtx*, unsigned, unsigned);
+extern rtx avr_add_ccclobber (rtx_insn *);
+#define DONE_ADD_CCC emit (avr_add_ccclobber (curr_insn)); DONE;
extern rtx lpm_reg_rtx;
extern rtx lpm_addr_reg_rtx;
@@ -208,6 +209,7 @@ extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *);
extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *);
extern rtl_opt_pass *make_avr_pass_split_nzb (gcc::context *);
extern rtl_opt_pass *make_avr_pass_split_after_peephole2 (gcc::context *);
+extern rtl_opt_pass *make_avr_pass_2moves (gcc::context *);
#ifdef RTX_CODE
extern bool avr_casei_sequence_check_operands (rtx *xop);
extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands);
@@ -238,7 +240,6 @@ typedef struct
unsigned insn_addresses :1;
unsigned legitimate_address_p :1;
unsigned legitimize_address :1;
- unsigned legitimize_reload_address :1;
unsigned progmem :1;
unsigned rtx_costs :1;
} avr_log_t;
diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc
index c469297..ae49d4d 100644
--- a/gcc/config/avr/avr.cc
+++ b/gcc/config/avr/avr.cc
@@ -411,6 +411,29 @@ avr_to_int_mode (rtx x)
}
+/* Return the pattern of INSN, but with added (clobber (reg:CC REG_CC)).
+ The pattern of INSN must be a PARALLEL or a SET. INSN is unchanged. */
+
+rtx
+avr_add_ccclobber (rtx_insn *insn)
+{
+ rtx pat = PATTERN (insn);
+ gcc_assert (GET_CODE (pat) == SET || GET_CODE (pat) == PARALLEL);
+
+ int newlen = GET_CODE (pat) == SET ? 2 : 1 + XVECLEN (pat, 0);
+ rtx newpat = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (newlen));
+ rtx elt0 = GET_CODE (pat) == SET ? pat : XVECEXP (pat, 0, 0);
+
+ XVECEXP (newpat, 0, 0) = copy_rtx (elt0);
+ XVECEXP (newpat, 0, newlen - 1) = gen_rtx_CLOBBER (VOIDmode, cc_reg_rtx);
+
+ for (int i = 1; i < newlen - 1; ++i)
+ XVECEXP (newpat, 0, i) = copy_rtx (XVECEXP (pat, 0, i));
+
+ return newpat;
+}
+
+
/* Return true if hard register REG supports the ADIW and SBIW instructions. */
bool
@@ -430,13 +453,6 @@ avr_ld_regno_p (int regno)
}
-static bool
-ra_in_progress ()
-{
- return avropt_lra_p ? lra_in_progress : reload_in_progress;
-}
-
-
/* Set `avr_arch' as specified by `-mmcu='.
Return true on success. */
@@ -2324,8 +2340,8 @@ avr_legitimate_address_p (machine_mode mode, rtx x, bool strict)
if (avr_log.legitimate_address_p)
{
avr_edump ("\n%?: ret=%d, mode=%m strict=%d "
- "reload_completed=%d ra_in_progress=%d %s:",
- ok, mode, strict, reload_completed, ra_in_progress (),
+ "reload_completed=%d lra_in_progress=%d %s:",
+ ok, mode, strict, reload_completed, lra_in_progress,
reg_renumber ? "(reg_renumber)" : "");
if (GET_CODE (x) == PLUS
@@ -2395,88 +2411,6 @@ avr_legitimize_address (rtx x, rtx oldx, machine_mode mode)
}
-/* Implement `LEGITIMIZE_RELOAD_ADDRESS'. */
-/* This will allow register R26/27 to be used where it is no worse than normal
- base pointers R28/29 or R30/31. For example, if base offset is greater
- than 63 bytes or for R++ or --R addressing. */
-
-rtx
-avr_legitimize_reload_address (rtx *px, machine_mode mode, int opnum,
- int type, int addr_type, int /*ind_levels*/,
- rtx (*mk_memloc)(rtx,int))
-{
- rtx x = *px;
-
- if (avr_log.legitimize_reload_address)
- avr_edump ("\n%?:%m %r\n", mode, x);
-
- if (1 && (GET_CODE (x) == POST_INC
- || GET_CODE (x) == PRE_DEC))
- {
- push_reload (XEXP (x, 0), XEXP (x, 0), &XEXP (x, 0), &XEXP (x, 0),
- POINTER_REGS, GET_MODE (x), GET_MODE (x), 0, 0,
- opnum, RELOAD_OTHER);
-
- if (avr_log.legitimize_reload_address)
- avr_edump (" RCLASS.1 = %R\n IN = %r\n OUT = %r\n",
- POINTER_REGS, XEXP (x, 0), XEXP (x, 0));
-
- return x;
- }
-
- if (GET_CODE (x) == PLUS
- && REG_P (XEXP (x, 0))
- && reg_equiv_constant (REGNO (XEXP (x, 0))) == 0
- && CONST_INT_P (XEXP (x, 1))
- && INTVAL (XEXP (x, 1)) >= 1)
- {
- bool fit = INTVAL (XEXP (x, 1)) <= MAX_LD_OFFSET (mode);
-
- if (fit)
- {
- if (reg_equiv_address (REGNO (XEXP (x, 0))) != 0)
- {
- int regno = REGNO (XEXP (x, 0));
- rtx mem = mk_memloc (x, regno);
-
- push_reload (XEXP (mem, 0), NULL_RTX, &XEXP (mem, 0), NULL,
- POINTER_REGS, Pmode, VOIDmode, 0, 0,
- 1, (enum reload_type) addr_type);
-
- if (avr_log.legitimize_reload_address)
- avr_edump (" RCLASS.2 = %R\n IN = %r\n OUT = %r\n",
- POINTER_REGS, XEXP (mem, 0), NULL_RTX);
-
- push_reload (mem, NULL_RTX, &XEXP (x, 0), NULL,
- BASE_POINTER_REGS, GET_MODE (x), VOIDmode, 0, 0,
- opnum, (enum reload_type) type);
-
- if (avr_log.legitimize_reload_address)
- avr_edump (" RCLASS.2 = %R\n IN = %r\n OUT = %r\n",
- BASE_POINTER_REGS, mem, NULL_RTX);
-
- return x;
- }
- }
- else if (! (frame_pointer_needed
- && XEXP (x, 0) == frame_pointer_rtx))
- {
- push_reload (x, NULL_RTX, px, NULL,
- POINTER_REGS, GET_MODE (x), VOIDmode, 0, 0,
- opnum, (enum reload_type) type);
-
- if (avr_log.legitimize_reload_address)
- avr_edump (" RCLASS.3 = %R\n IN = %r\n OUT = %r\n",
- POINTER_REGS, x, NULL_RTX);
-
- return x;
- }
- }
-
- return NULL_RTX;
-}
-
-
/* Helper function to print assembler resp. track instruction
sequence lengths. Always return "".
@@ -12824,6 +12758,16 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code,
return true;
case SIGN_EXTEND:
+ if (GET_CODE (XEXP (x, 0)) == ASHIFT
+ && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
+ {
+ // "*sext.ashift<QIPSI:mode><HISI:mode>2_split"
+ int m0 = GET_MODE_SIZE (GET_MODE (XEXP (x, 0)));
+ int m1 = GET_MODE_SIZE (mode);
+ *total = COSTS_N_INSNS (m0 * INTVAL (XEXP (XEXP (x, 0), 1))
+ + m1 - m0);
+ return true;
+ }
*total = COSTS_N_INSNS (n_bytes + 2
- GET_MODE_SIZE (GET_MODE (XEXP (x, 0))));
*total += avr_operand_rtx_cost (XEXP (x, 0), GET_MODE (XEXP (x, 0)),
@@ -13936,8 +13880,8 @@ extra_constraint_Q (rtx x)
|| xx == arg_pointer_rtx);
if (avr_log.constraints)
- avr_edump ("\n%?=%d reload_completed=%d ra_in_progress=%d\n %r\n",
- ok, reload_completed, ra_in_progress (), x);
+ avr_edump ("\n%?=%d reload_completed=%d lra_in_progress=%d\n %r\n",
+ ok, reload_completed, lra_in_progress, x);
}
return ok;
@@ -14142,17 +14086,6 @@ avr_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
if (GET_MODE_SIZE (mode) == 1)
return true;
- /* FIXME: Ideally, the following test is not needed.
- However, it turned out that it can reduce the number
- of spill fails. AVR and it's poor endowment with
- address registers is extreme stress test for reload. */
-
- if (GET_MODE_SIZE (mode) >= 4
- && regno + GET_MODE_SIZE (mode) >= REG_30
- // This problem only concerned the old reload.
- && ! avropt_lra_p)
- return false;
-
/* All modes larger than 8 bits should start in an even register. */
return !(regno & 1);
@@ -14418,6 +14351,13 @@ avr_output_addr_vec (rtx_insn *labl, rtx table)
// Output the label that precedes the table.
ASM_OUTPUT_ALIGN (stream, 1);
+
+ char s_labl[40];
+ targetm.asm_out.generate_internal_label (s_labl, "L",
+ CODE_LABEL_NUMBER (labl));
+ ASM_OUTPUT_TYPE_DIRECTIVE (stream, s_labl,
+ AVR_HAVE_JMP_CALL ? "object" : "function");
+
targetm.asm_out.internal_label (stream, "L", CODE_LABEL_NUMBER (labl));
// Output the table's content.
@@ -14907,8 +14847,8 @@ avr_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
if (avr_log.legitimate_address_p)
{
avr_edump ("\n%?: ret=%b, mode=%m strict=%d "
- "reload_completed=%d ra_in_progress=%d %s:",
- ok, mode, strict, reload_completed, ra_in_progress (),
+ "reload_completed=%d lra_in_progress=%d %s:",
+ ok, mode, strict, reload_completed, lra_in_progress,
reg_renumber ? "(reg_renumber)" : "");
if (GET_CODE (x) == PLUS
@@ -14984,10 +14924,11 @@ avr_addr_space_convert (rtx src, tree type_old, tree type_new)
/* Linearize memory: RAM has bit 23 set. When as_new = __flashx then
this is basically UB since __flashx mistreats RAM addresses, but there
- is no way to bail out. (Though -Waddr-space-convert will tell.) */
+ is no way to bail out. (Though -Waddr-space-convert will tell.)
+ ...but PR121277 is confusing, in particular when NULL is coming in. */
int msb = ADDR_SPACE_GENERIC_P (as_old)
- ? 0x80
+ ? as_new == ADDR_SPACE_MEMX ? 0x80 : 0x00
: avr_addrspace[as_old].segment;
src = force_reg (Pmode, src);
@@ -15085,10 +15026,16 @@ avr_convert_to_type (tree type, tree expr)
const char *name_old = avr_addrspace[as_old].name;
const char *name_new = avr_addrspace[as_new].name;
- warning (OPT_Waddr_space_convert,
- "conversion from address space %qs to address space %qs",
- ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old,
- ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new);
+ // Be relaxed when NULL is used, and when 0x0 stands for
+ // address 0x0.
+ bool nowarn = (expr == null_pointer_node
+ && (as_new == ADDR_SPACE_FLASHX
+ || as_new == ADDR_SPACE_FLASH));
+ if (!nowarn)
+ warning (OPT_Waddr_space_convert,
+ "conversion from address space %qs to address space %qs",
+ ADDR_SPACE_GENERIC_P (as_old) ? "generic" : name_old,
+ ADDR_SPACE_GENERIC_P (as_new) ? "generic" : name_new);
return fold_build1_loc (loc, ADDR_SPACE_CONVERT_EXPR, type, expr);
}
@@ -16679,15 +16626,6 @@ avr_unwind_word_mode ()
return Pmode;
}
-
-/* Implement `TARGET_LRA_P'. */
-
-static bool
-avr_use_lra_p ()
-{
- return avropt_lra_p;
-}
-
/* Initialize the GCC target structure. */
@@ -16829,9 +16767,6 @@ avr_use_lra_p ()
#undef TARGET_CONVERT_TO_TYPE
#define TARGET_CONVERT_TO_TYPE avr_convert_to_type
-#undef TARGET_LRA_P
-#define TARGET_LRA_P avr_use_lra_p
-
#undef TARGET_ADDR_SPACE_SUBSET_P
#define TARGET_ADDR_SPACE_SUBSET_P avr_addr_space_subset_p
diff --git a/gcc/config/avr/avr.h b/gcc/config/avr/avr.h
index cb818c3..335f9fa5 100644
--- a/gcc/config/avr/avr.h
+++ b/gcc/config/avr/avr.h
@@ -309,12 +309,6 @@ enum reg_class {
#define STATIC_CHAIN_REGNUM ((AVR_TINY) ? 18 :2)
-#define RELOAD_ELIMINABLE_REGS { \
- { ARG_POINTER_REGNUM, STACK_POINTER_REGNUM }, \
- { ARG_POINTER_REGNUM, FRAME_POINTER_REGNUM }, \
- { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM }, \
- { FRAME_POINTER_REGNUM + 1, STACK_POINTER_REGNUM + 1 } }
-
#define ELIMINABLE_REGS \
{ \
{ ARG_POINTER_REGNUM, STACK_POINTER_REGNUM }, \
@@ -358,18 +352,6 @@ typedef struct avr_args
#define MAX_REGS_PER_ADDRESS 1
-#define LEGITIMIZE_RELOAD_ADDRESS(X,MODE,OPNUM,TYPE,IND_L,WIN) \
- do { \
- rtx new_x = avr_legitimize_reload_address (&(X), MODE, OPNUM, TYPE, \
- ADDR_TYPE (TYPE), \
- IND_L, make_memloc); \
- if (new_x) \
- { \
- X = new_x; \
- goto WIN; \
- } \
- } while (0)
-
/* We increase branch costs after reload in order to keep basic-block
reordering from introducing out-of-line jumps and to prefer fall-through
edges instead. The default branch costs are 0, mainly because otherwise
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index f8bbdc7..60b1f60 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -574,9 +574,8 @@
&& REG_Z == REGNO (XEXP (operands[0], 0))"
"#"
"&& reload_completed"
- [(parallel [(set (reg:MOVMODE 22)
- (match_dup 0))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*load_<mode>_libgcc"
[(set (reg:MOVMODE 22)
@@ -716,14 +715,8 @@
|| avr_load_libgcc_insn_p (insn, ADDR_SPACE_FLASHX, true)"
"#"
"&& reload_completed"
- [(parallel [(set (reg:MOVMODE REG_22)
- (match_dup 0))
- (clobber (reg:QI REG_21))
- (clobber (reg:HI REG_Z))
- (clobber (reg:CC REG_CC))])]
- {
- operands[0] = SET_SRC (single_set (curr_insn));
- })
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fxload_<mode>_libgcc"
[(set (reg:MOVMODE REG_22)
@@ -853,9 +846,8 @@
|| reg_or_0_operand (operands[1], <MODE>mode)"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (match_dup 1))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
;; "movqi_insn"
;; "movqq_insn" "movuqq_insn"
@@ -964,9 +956,8 @@
|| reg_or_0_operand (operands[1], <MODE>mode)"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (match_dup 1))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mov<mode>"
[(set (match_operand:ALL2 0 "nonimmediate_operand" "=r,r ,r,m ,d,*r,q,r")
@@ -1137,9 +1128,8 @@
|| const0_rtx == operands[1]"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (match_dup 1))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*movpsi"
[(set (match_operand:PSI 0 "nonimmediate_operand" "=r,r,r ,Qm,!d,r")
@@ -1197,9 +1187,8 @@
|| reg_or_0_operand (operands[1], <MODE>mode)"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (match_dup 1))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mov<mode>"
[(set (match_operand:ALL4 0 "nonimmediate_operand" "=r,r ,r ,Qm ,!d,r")
@@ -1245,9 +1234,8 @@
|| reg_or_0_operand (operands[1], SFmode)"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (match_dup 1))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*movsf"
[(set (match_operand:SF 0 "nonimmediate_operand" "=r,r,r ,Qm,!d,r")
@@ -1326,16 +1314,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (mem:BLK (reg:HI REG_X))
- (mem:BLK (reg:HI REG_Z)))
- (unspec [(match_dup 0)]
- UNSPEC_CPYMEM)
- (use (match_dup 1))
- (clobber (reg:HI REG_X))
- (clobber (reg:HI REG_Z))
- (clobber (reg:QI LPM_REGNO))
- (clobber (match_dup 2))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*cpymem_<mode>"
[(set (mem:BLK (reg:HI REG_X))
@@ -1382,22 +1362,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (mem:BLK (reg:HI REG_X))
- (match_dup 2))
- (unspec [(match_dup 0)]
- UNSPEC_CPYMEM)
- (use (reg:QIHI 24))
- (clobber (reg:HI REG_X))
- (clobber (reg:HI REG_Z))
- (clobber (reg:QI LPM_REGNO))
- (clobber (reg:HI 24))
- (clobber (reg:QI 23))
- (clobber (mem:QI (match_dup 1)))
- (clobber (reg:CC REG_CC))])]
- {
- rtx xset = XVECEXP (PATTERN (curr_insn), 0, 0);
- operands[2] = SET_SRC (xset);
- })
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*cpymemx_<mode>"
[(set (mem:BLK (reg:HI REG_X))
@@ -1461,13 +1427,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (mem:BLK (match_dup 0))
- (const_int 0))
- (use (match_dup 1))
- (use (match_dup 2))
- (clobber (match_dup 3))
- (clobber (match_dup 4))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*clrmemqi"
[(set (mem:BLK (match_operand:HI 0 "register_operand" "e"))
@@ -1492,14 +1453,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (mem:BLK (match_dup 0))
- (const_int 0))
- (use (match_dup 1))
- (use (match_dup 2))
- (clobber (match_dup 3))
- (clobber (match_dup 4))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "adiw,*")])
@@ -1550,13 +1505,8 @@
""
"#"
"&& reload_completed"
- [(parallel
- [(set (match_dup 0)
- (unspec:HI [(mem:BLK (match_dup 1))
- (const_int 0)
- (match_dup 2)]
- UNSPEC_STRLEN))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*strlenhi"
[(set (match_operand:HI 0 "register_operand" "=e")
@@ -1581,10 +1531,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:ALL1 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*add<mode>3"
[(set (match_operand:ALL1 0 "register_operand" "=r,d ,r ,r ,r ,r")
@@ -1640,10 +1588,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:HI (zero_extend:HI (match_dup 1))
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*addhi3_zero_extend"
[(set (match_operand:HI 0 "register_operand" "=r,*?r")
@@ -1663,10 +1609,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:HI (match_dup 1)
- (zero_extend:HI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*addhi3_zero_extend1"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -1684,10 +1628,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:HI (zero_extend:HI (match_dup 1))
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*addhi3_zero_extend.const"
[(set (match_operand:HI 0 "register_operand" "=d")
@@ -1723,11 +1665,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:HI (ashift:HI (zero_extend:HI (match_dup 1))
- (const_int 1))
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*addhi3_zero_extend.ashift1"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -1752,11 +1691,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:HI (zero_extend:HI (match_dup 1))
- (zero_extend:HI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
-
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*usum_widenqihi3"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -1774,10 +1710,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:HI (zero_extend:HI (match_dup 1))
- (zero_extend:HI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*udiff_widenqihi3"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -1797,7 +1731,7 @@
return avr_out_addto_sp (operands, NULL);
}
""
- [(const_int 0)]
+ [(scratch)]
{
// Do not attempt to split this pattern. This FAIL is necessary
// to prevent the splitter from matching *add<ALL2>3_split, splitting
@@ -1909,11 +1843,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:ALL2 (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
;; "*addhi3_clobber"
;; "*addhq3_clobber" "*adduhq3_clobber"
@@ -1943,11 +1874,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:ALL4 (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*add<mode>3"
[(set (match_operand:ALL4 0 "register_operand" "=??r,d ,r")
@@ -1979,10 +1907,8 @@
&& (<HISI:SIZE> > 2 || <CODE> == SIGN_EXTEND)"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:HISI (any_extend:HISI (match_dup 1))
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
;; "*addhi3.sign_extend.qi"
;; "*addpsi3.zero_extend.qi" "*addpsi3.sign_extend.qi"
@@ -2019,10 +1945,8 @@
"<HISI:SIZE> > <QIPSI:SIZE>"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:HISI (match_dup 1)
- (any_extend:HISI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
;; "*subhi3.zero_extend.qi" "*subhi3.sign_extend.qi"
;; "*subpsi3.zero_extend.qi" "*subpsi3.sign_extend.qi"
@@ -2053,11 +1977,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:PSI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3 ))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*addpsi3"
[(set (match_operand:PSI 0 "register_operand" "=??r,d ,d,r")
@@ -2079,10 +2000,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:PSI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*subpsi3"
[(set (match_operand:PSI 0 "register_operand" "=r")
@@ -2106,10 +2025,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:ALL1 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sub<mode>3"
[(set (match_operand:ALL1 0 "register_operand" "=??r,d ,r ,r ,r ,r")
@@ -2137,11 +2054,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:ALL2 (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sub<mode>3"
[(set (match_operand:ALL2 0 "register_operand" "=??r,d ,*r")
@@ -2167,11 +2081,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:ALL4 (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sub<mode>3"
[(set (match_operand:ALL4 0 "register_operand" "=??r,d ,r")
@@ -2209,10 +2120,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:QI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulqi3_enh"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -2243,10 +2152,8 @@
"!AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:QI 24)
- (mult:QI (reg:QI 24) (reg:QI 22)))
- (clobber (reg:QI 22))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulqi3_call"
[(set (reg:QI 24)
@@ -2269,12 +2176,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (truncate:QI
- (lshiftrt:HI (mult:HI (any_extend:HI (match_dup 1))
- (any_extend:HI (match_dup 2)))
- (const_int 8))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<extend_su>mulqi3_highpart"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -2361,21 +2264,21 @@
(const_int 0))))
(clobber (reg:CC REG_CC))])])
-;; *subqi3.lt0 *subqi3.ge0
-;; *subhi3.lt0 *subhi3.ge0
-;; *subpsi3.lt0 *subpsi3.ge0
-;; *subsi3.lt0 *subsi3.ge0
-(define_insn "*sub<QISI:mode>3.<code>0"
- [(set (match_operand:QISI 0 "register_operand" "=r")
- (minus:QISI (match_operand:QISI 1 "register_operand" "0")
- (gelt:QISI (match_operand:QISI2 2 "register_operand" "r")
- (const_int 0))))
- (clobber (reg:CC REG_CC))]
- "reload_completed"
- {
- return avr_out_add_msb (insn, operands, <CODE>, nullptr);
- }
- [(set_attr "adjust_len" "add_<code>0")])
+;; *addqi3.lt0_split *addqi3.ge0_split
+;; *addhi3.lt0_split *addhi3.ge0_split
+;; *addpsi3.lt0_split *addpsi3.ge0_split
+;; *addsi3.lt0_split *addsi3.ge0_split
+(define_insn_and_split "*add<QISI:mode>3.<code>0_split"
+ [(set (match_operand:QISI 0 "register_operand" "=r")
+ (plus:QISI (gelt:QISI (match_operand:QISI2 1 "register_operand" "r")
+ (const_int 0))
+ (match_operand:QISI 2 "register_operand" "0")))]
+ ""
+ "#"
+ "&& reload_completed"
+ ; *add<QISI:mode>3.<code>0
+ [(scratch)]
+ { DONE_ADD_CCC })
;; *addqi3.lt0 *addqi3.ge0
;; *addhi3.lt0 *addhi3.ge0
@@ -2393,25 +2296,6 @@
}
[(set_attr "adjust_len" "add_<code>0")])
-;; *addqi3.lt0_split *addqi3.ge0_split
-;; *addhi3.lt0_split *addhi3.ge0_split
-;; *addpsi3.lt0_split *addpsi3.ge0_split
-;; *addsi3.lt0_split *addsi3.ge0_split
-(define_insn_and_split "*add<QISI:mode>3.<code>0_split"
- [(set (match_operand:QISI 0 "register_operand" "=r")
- (plus:QISI (gelt:QISI (match_operand:QISI2 1 "register_operand" "r")
- (const_int 0))
- (match_operand:QISI 2 "register_operand" "0")))]
- ""
- "#"
- "&& reload_completed"
- [; *add<QISI:mode>3.<code>0
- (parallel [(set (match_dup 0)
- (plus:QISI (gelt:QISI (match_dup 1)
- (const_int 0))
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
-
;; *subqi3.lt0_split *subqi3.ge0_split
;; *subhi3.lt0_split *subhi3.ge0_split
;; *subpsi3.lt0_split *subpsi3.ge0_split
@@ -2424,13 +2308,25 @@
""
"#"
"&& reload_completed"
- [; *sub<QISI:mode>3.<code>0
- (parallel [(set (match_dup 0)
- (minus:QISI (match_dup 1)
- (gelt:QISI (match_dup 2)
- (const_int 0))))
- (clobber (reg:CC REG_CC))])])
+ ; *sub<QISI:mode>3.<code>0
+ [(scratch)]
+ { DONE_ADD_CCC })
+;; *subqi3.lt0 *subqi3.ge0
+;; *subhi3.lt0 *subhi3.ge0
+;; *subpsi3.lt0 *subpsi3.ge0
+;; *subsi3.lt0 *subsi3.ge0
+(define_insn "*sub<QISI:mode>3.<code>0"
+ [(set (match_operand:QISI 0 "register_operand" "=r")
+ (minus:QISI (match_operand:QISI 1 "register_operand" "0")
+ (gelt:QISI (match_operand:QISI2 2 "register_operand" "r")
+ (const_int 0))))
+ (clobber (reg:CC REG_CC))]
+ "reload_completed"
+ {
+ return avr_out_add_msb (insn, operands, <CODE>, nullptr);
+ }
+ [(set_attr "adjust_len" "add_<code>0")])
(define_insn_and_split "*umulqihi3.call_split"
[(set (reg:HI 24)
@@ -2441,12 +2337,8 @@
"!AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (mult:HI (zero_extend:HI (reg:QI 22))
- (zero_extend:HI (reg:QI 24))))
- (clobber (reg:QI 21))
- (clobber (reg:HI 22))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*umulqihi3.call"
[(set (reg:HI 24)
@@ -2469,10 +2361,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (any_extend:HI (match_dup 1))
- (any_extend:HI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "<extend_u>mulqihi3"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -2492,10 +2382,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (zero_extend:HI (match_dup 1))
- (sign_extend:HI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*usmulqihi3"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -2517,10 +2405,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (sign_extend:HI (match_dup 1))
- (zero_extend:HI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sumulqihi3"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -2542,10 +2428,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (not:HI (zero_extend:HI (not:QI (match_dup 1))))
- (sign_extend:HI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*osmulqihi3"
[(set (match_operand:HI 0 "register_operand" "=&r")
@@ -2566,10 +2450,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (not:HI (zero_extend:HI (not:QI (match_dup 1))))
- (zero_extend:HI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*oumulqihi3"
[(set (match_operand:HI 0 "register_operand" "=&r")
@@ -2596,11 +2478,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:QI (mult:QI (match_dup 1)
- (match_dup 2))
- (match_dup 3)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*maddqi4"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -2622,11 +2501,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:QI (match_dup 3)
- (mult:QI (match_dup 1)
- (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*msubqi4"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -2705,11 +2581,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:HI (mult:HI (any_extend:HI (match_dup 1))
- (any_extend:HI (match_dup 2)))
- (match_dup 3)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<extend_u>maddqihi4"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -2734,11 +2607,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:HI (match_dup 3)
- (mult:HI (any_extend:HI (match_dup 1))
- (any_extend:HI (match_dup 2)))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<extend_u>msubqihi4"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -2765,11 +2635,8 @@
&& <any_extend:CODE> != <any_extend2:CODE>"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (plus:HI (mult:HI (any_extend:HI (match_dup 1))
- (any_extend2:HI (match_dup 2)))
- (match_dup 3)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<any_extend:extend_su><any_extend2:extend_su>msubqihi4"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -2800,11 +2667,8 @@
&& <any_extend:CODE> != <any_extend2:CODE>"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (minus:HI (match_dup 3)
- (mult:HI (any_extend:HI (match_dup 1))
- (any_extend2:HI (match_dup 2)))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<any_extend:extend_su><any_extend2:extend_su>msubqihi4"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -3072,16 +2936,14 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashift:HI (sign_extend:HI (match_dup 1))
- (const_int 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ashiftqihi2.signx.1"
[(set (match_operand:HI 0 "register_operand" "=r,*r")
(ashift:HI (sign_extend:HI (match_operand:QI 1 "register_operand" "0,r"))
(const_int 1)))
- (clobber (reg:CC REG_CC)) ]
+ (clobber (reg:CC REG_CC))]
"reload_completed"
"@
lsl %A0\;sbc %B0,%B0
@@ -3142,6 +3004,41 @@
operands[2] = gen_int_mode (1 << INTVAL (operands[2]), QImode);
})
+(define_insn_and_split "*sext.ashift<QIPSI:mode><HISI:mode>2_split"
+ [(set (match_operand:HISI 0 "register_operand" "=r")
+ (sign_extend:HISI (ashift:QIPSI (match_operand:QIPSI 1 "register_operand" "0")
+ (match_operand:QI 2 "const_int_operand" "PKC03"))))]
+ "<HISI:SIZE> > <QIPSI:SIZE>
+ && IN_RANGE (INTVAL (operands[2]), 1, 2 + (<QIPSI:SIZE> <= 2))"
+ "#"
+ "&& reload_completed"
+ [(scratch)]
+ { DONE_ADD_CCC })
+
+(define_insn "*sext.ashift<QIPSI:mode><HISI:mode>2"
+ [(set (match_operand:HISI 0 "register_operand" "=r")
+ (sign_extend:HISI (ashift:QIPSI (match_operand:QIPSI 1 "register_operand" "0")
+ (match_operand:QI 2 "const_int_operand" "PKC03"))))
+ (clobber (reg:CC REG_CC))]
+ "reload_completed
+ && <HISI:SIZE> > <QIPSI:SIZE>
+ && IN_RANGE (INTVAL (operands[2]), 1, 2 + (<QIPSI:SIZE> <= 2))"
+ {
+ const int regno = REGNO (operands[0]);
+ // The shift.
+ for (int s = 0; s < (int) INTVAL (operands[2]); ++s)
+ for (int b = 0; b < <QIPSI:SIZE>; ++b)
+ output_asm_insn (b == 0 ? "lsl %0" : "rol %0",
+ &all_regs_rtx[regno + b]);
+ // Sign-extend can use carry.
+ for (int b = <QIPSI:SIZE>; b < <HISI:SIZE>; ++b)
+ output_asm_insn ("sbc %0,%0", &all_regs_rtx[regno + b]);
+ return "";
+ }
+ [(set (attr "length")
+ (plus (symbol_ref "<QIPSI:SIZE> * INTVAL (operands[2])")
+ (symbol_ref "<HISI:SIZE> - <QIPSI:SIZE>")))])
+
;******************************************************************************
; mul HI: $1 = sign-/zero-/one-extend, $2 = reg
;******************************************************************************
@@ -3153,10 +3050,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (sign_extend:HI (match_dup 1))
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulsqihi3"
[(set (match_operand:HI 0 "register_operand" "=&r")
@@ -3178,10 +3073,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (zero_extend:HI (match_dup 1))
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*muluqihi3"
[(set (match_operand:HI 0 "register_operand" "=&r")
@@ -3205,10 +3098,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (not:HI (zero_extend:HI (not:QI (match_dup 1))))
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*muloqihi3"
[(set (match_operand:HI 0 "register_operand" "=&r")
@@ -3277,10 +3168,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:HI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulhi3_enh"
[(set (match_operand:HI 0 "register_operand" "=&r")
@@ -3319,11 +3208,8 @@
"!AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (mult:HI (reg:HI 24) (reg:HI 22)))
- (clobber (reg:HI 22))
- (clobber (reg:QI 21))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulhi3_call"
[(set (reg:HI 24)
@@ -3719,11 +3605,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 22)
- (mult:SI (reg:SI 22)
- (reg:SI 18)))
- (clobber (reg:HI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn_and_split "*mulsi3_call_pr118012_split"
[(set (reg:SI 22)
@@ -3737,13 +3620,8 @@
&& ! AVR_TINY"
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 22)
- (mult:SI (reg:SI 22)
- (reg:SI 18)))
- (clobber (reg:SI 18))
- (clobber (reg:HI 26))
- (clobber (reg:HI 30))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulsi3_call"
[(set (reg:SI 22)
@@ -3779,10 +3657,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 22)
- (mult:SI (any_extend:SI (reg:HI 18))
- (any_extend:SI (reg:HI 26))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<extend_u>mulhisi3_call"
[(set (reg:SI 22)
@@ -3804,12 +3680,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (truncate:HI (lshiftrt:SI (mult:SI (any_extend:SI (reg:HI 18))
- (any_extend:SI (reg:HI 26)))
- (const_int 16))))
- (clobber (reg:HI 22))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*<extend_su>mulhi3_highpart_call"
[(set (reg:HI 24)
@@ -3829,10 +3701,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 22)
- (mult:SI (zero_extend:SI (reg:HI 18))
- (sign_extend:SI (reg:HI 26))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*usmulhisi3_call"
[(set (reg:SI 22)
@@ -3850,10 +3720,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 22)
- (mult:SI (any_extend:SI (reg:HI 26))
- (reg:SI 18)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mul<extend_su>hisi3_call"
[(set (reg:SI 22)
@@ -3871,10 +3739,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 22)
- (mult:SI (not:SI (zero_extend:SI (not:HI (reg:HI 26))))
- (reg:SI 18)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulohisi3_call"
[(set (reg:SI 22)
@@ -3925,11 +3791,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:QI 24) (div:QI (reg:QI 24) (reg:QI 22)))
- (set (reg:QI 25) (mod:QI (reg:QI 24) (reg:QI 22)))
- (clobber (reg:QI 22))
- (clobber (reg:QI 23))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*divmodqi4_call"
[(set (reg:QI 24) (div:QI (reg:QI 24) (reg:QI 22)))
@@ -3969,10 +3832,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:QI 24) (udiv:QI (reg:QI 24) (reg:QI 22)))
- (set (reg:QI 25) (umod:QI (reg:QI 24) (reg:QI 22)))
- (clobber (reg:QI 23))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*udivmodqi4_call"
[(set (reg:QI 24) (udiv:QI (reg:QI 24) (reg:QI 22)))
@@ -4013,11 +3874,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 22) (div:HI (reg:HI 24) (reg:HI 22)))
- (set (reg:HI 24) (mod:HI (reg:HI 24) (reg:HI 22)))
- (clobber (reg:HI 26))
- (clobber (reg:QI 21))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*divmodhi4_call"
[(set (reg:HI 22) (div:HI (reg:HI 24) (reg:HI 22)))
@@ -4059,11 +3917,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 22) (udiv:HI (reg:HI 24) (reg:HI 22)))
- (set (reg:HI 24) (umod:HI (reg:HI 24) (reg:HI 22)))
- (clobber (reg:HI 26))
- (clobber (reg:QI 21))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*udivmodhi4_call"
[(set (reg:HI 22) (udiv:HI (reg:HI 24) (reg:HI 22)))
@@ -4112,10 +3967,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (mult:PSI (zero_extend:PSI (match_dup 1))
- (zero_extend:PSI (match_dup 2))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*umulqihipsi3"
[(set (match_operand:PSI 0 "register_operand" "=&r")
@@ -4134,31 +3987,17 @@
(define_insn_and_split "*umulhiqipsi3_split"
[(set (match_operand:PSI 0 "register_operand" "=&r")
- (mult:PSI (zero_extend:PSI (match_operand:HI 2 "register_operand" "r"))
- (zero_extend:PSI (match_operand:QI 1 "register_operand" "r"))))]
+ (mult:PSI (zero_extend:PSI (match_operand:HI 1 "register_operand" "r"))
+ (zero_extend:PSI (match_operand:QI 2 "register_operand" "r"))))]
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
+ ; "*umulqihipsi3"
[(parallel [(set (match_dup 0)
(mult:PSI (zero_extend:PSI (match_dup 2))
(zero_extend:PSI (match_dup 1))))
(clobber (reg:CC REG_CC))])])
-(define_insn "*umulhiqipsi3"
- [(set (match_operand:PSI 0 "register_operand" "=&r")
- (mult:PSI (zero_extend:PSI (match_operand:HI 2 "register_operand" "r"))
- (zero_extend:PSI (match_operand:QI 1 "register_operand" "r"))))
- (clobber (reg:CC REG_CC))]
- "AVR_HAVE_MUL && reload_completed"
- "mul %1,%A2
- movw %A0,r0
- mul %1,%B2
- add %B0,r0
- mov %C0,r1
- clr __zero_reg__
- adc %C0,__zero_reg__"
- [(set_attr "length" "7")])
-
(define_expand "mulsqipsi3"
[(parallel [(set (match_operand:PSI 0 "pseudo_register_operand" "")
(mult:PSI (sign_extend:PSI (match_operand:QI 1 "pseudo_register_operand" ""))
@@ -4229,10 +4068,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:PSI 18)
- (mult:PSI (sign_extend:PSI (reg:QI 25))
- (reg:PSI 22)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulsqipsi3.libgcc"
[(set (reg:PSI 18)
@@ -4253,13 +4090,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:PSI 22)
- (mult:PSI (reg:PSI 22)
- (reg:PSI 18)))
- (clobber (reg:QI 21))
- (clobber (reg:QI 25))
- (clobber (reg:HI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*mulpsi3.libgcc"
[(set (reg:PSI 22)
@@ -4311,12 +4143,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:PSI 22) (div:PSI (reg:PSI 22) (reg:PSI 18)))
- (set (reg:PSI 18) (mod:PSI (reg:PSI 22) (reg:PSI 18)))
- (clobber (reg:QI 21))
- (clobber (reg:QI 25))
- (clobber (reg:QI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*divmodpsi4_call"
[(set (reg:PSI 22) (div:PSI (reg:PSI 22) (reg:PSI 18)))
@@ -4360,12 +4188,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:PSI 22) (udiv:PSI (reg:PSI 22) (reg:PSI 18)))
- (set (reg:PSI 18) (umod:PSI (reg:PSI 22) (reg:PSI 18)))
- (clobber (reg:QI 21))
- (clobber (reg:QI 25))
- (clobber (reg:QI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*udivmodpsi4_call"
[(set (reg:PSI 22) (udiv:PSI (reg:PSI 22) (reg:PSI 18)))
@@ -4411,11 +4235,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 18) (div:SI (reg:SI 22) (reg:SI 18)))
- (set (reg:SI 22) (mod:SI (reg:SI 22) (reg:SI 18)))
- (clobber (reg:HI 26))
- (clobber (reg:HI 30))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*divmodsi4_call"
[(set (reg:SI 18) (div:SI (reg:SI 22) (reg:SI 18)))
@@ -4458,11 +4279,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 18) (udiv:SI (reg:SI 22) (reg:SI 18)))
- (set (reg:SI 22) (umod:SI (reg:SI 22) (reg:SI 18)))
- (clobber (reg:HI 26))
- (clobber (reg:HI 30))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*udivmodsi4_call"
[(set (reg:SI 18) (udiv:SI (reg:SI 22) (reg:SI 18)))
@@ -4484,10 +4302,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (and:QI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*andqi3"
[(set (match_operand:QI 0 "register_operand" "=??r,d,*l ,r")
@@ -4511,11 +4327,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (and:HI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*andhi3"
[(set (match_operand:HI 0 "register_operand" "=??r,d,d,r ,r ,r")
@@ -4545,11 +4358,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (and:PSI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*andpsi3"
[(set (match_operand:PSI 0 "register_operand" "=??r,d,r ,r ,r")
@@ -4580,11 +4390,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (and:SI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*andsi3"
[(set (match_operand:SI 0 "register_operand" "=??r,d,r ,r ,r")
@@ -4634,10 +4441,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ior:QI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*iorqi3"
[(set (match_operand:QI 0 "register_operand" "=??r,d,*l")
@@ -4659,11 +4464,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ior:HI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*iorhi3"
[(set (match_operand:HI 0 "register_operand" "=??r,d,d,r ,r")
@@ -4691,11 +4493,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ior:PSI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*iorpsi3"
[(set (match_operand:PSI 0 "register_operand" "=??r,d,r ,r")
@@ -4723,11 +4522,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ior:SI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*iorsi3"
[(set (match_operand:SI 0 "register_operand" "=??r,d,r ,r")
@@ -4758,10 +4554,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (xor:QI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*xorqi3"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -4780,11 +4574,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (xor:HI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*xorhi3"
[(set (match_operand:HI 0 "register_operand" "=??r,r ,d ,r")
@@ -4810,11 +4601,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (xor:PSI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*xorpsi3"
[(set (match_operand:PSI 0 "register_operand" "=??r,r ,d ,r")
@@ -4842,11 +4630,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (xor:SI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*xorsi3"
[(set (match_operand:SI 0 "register_operand" "=??r,r ,d ,r")
@@ -4918,7 +4703,7 @@
(clobber (reg:CC REG_CC))])]
"optimize
&& reload_completed"
- [(const_int 1)]
+ [(scratch)]
{
for (int i = 0; i < <SIZE>; i++)
{
@@ -5026,10 +4811,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (rotate:QI (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*rotlqi3"
[(set (match_operand:QI 0 "register_operand" "=r,r,r ,r ,r ,r ,r ,r")
@@ -5099,10 +4882,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (rotate:HI (match_dup 1)
- (const_int 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*rotlhi2.1"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -5120,10 +4901,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (rotate:HI (match_dup 1)
- (const_int 15)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*rotlhi2.15"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -5141,10 +4920,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (rotate:PSI (match_dup 1)
- (const_int 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*rotlpsi2.1"
[(set (match_operand:PSI 0 "register_operand" "=r")
@@ -5162,10 +4939,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (rotate:PSI (match_dup 1)
- (const_int 23)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*rotlpsi2.23"
[(set (match_operand:PSI 0 "register_operand" "=r")
@@ -5183,10 +4958,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (rotate:SI (match_dup 1)
- (const_int 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*rotlsi2.1"
[(set (match_operand:SI 0 "register_operand" "=r")
@@ -5204,10 +4977,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (rotate:SI (match_dup 1)
- (const_int 31)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*rotlsi2.31"
[(set (match_operand:SI 0 "register_operand" "=r")
@@ -5239,7 +5010,7 @@
&& 0 == INTVAL (operands[2]) % 16"
"#"
"&& reload_completed"
- [(const_int 0)]
+ [(scratch)]
{
avr_rotate_bytes (operands);
DONE;
@@ -5263,7 +5034,7 @@
&& 0 == INTVAL (operands[2]) % 16))"
"#"
"&& reload_completed"
- [(const_int 0)]
+ [(scratch)]
{
avr_rotate_bytes (operands);
DONE;
@@ -5273,41 +5044,6 @@
;;<< << << << << << << << << << << << << << << << << << << << << << << << << <<
;; arithmetic shift left
-;; Work around PR120423: Transform left shift of a paradoxical subreg
-;; into left shift of the zero-extended entity.
-(define_split ; PR120423
- [(set (match_operand:HISI 0 "register_operand")
- (ashift:HISI (subreg:HISI (match_operand:QIPSI 1 "nonimmediate_operand")
- 0)
- (match_operand:QI 2 "const_int_operand")))]
- "!reload_completed
- && !avropt_lra_p
- && <HISI:SIZE> > <QIPSI:SIZE>"
- [(set (match_dup 4)
- (zero_extend:HISI (match_dup 5)))
- (set (match_dup 0)
- (ashift:HISI (match_dup 4)
- (match_dup 2)))]
- {
- operands[4] = gen_reg_rtx (<HISI:MODE>mode);
- operands[5] = force_reg (<QIPSI:MODE>mode, operands[1]);
- })
-
-;; Similar happens for PR116389.
-(define_split ; PR116389
- [(set (match_operand:HISI 0 "register_operand")
- (subreg:HISI (match_operand:QIPSI 1 "nonimmediate_operand")
- 0))]
- "!reload_completed
- && !avropt_lra_p
- && <HISI:SIZE> > <QIPSI:SIZE>"
- [(set (match_dup 0)
- (zero_extend:HISI (match_dup 2)))]
- {
- operands[2] = force_reg (<QIPSI:MODE>mode, operands[1]);
- })
-
-
;; "ashlqi3"
;; "ashlqq3" "ashluqq3"
(define_expand "ashl<mode>3"
@@ -5363,10 +5099,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashift:ALL1 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ashl<mode>3"
[(set (match_operand:ALL1 0 "register_operand" "=r,r ,r ,r,r")
@@ -5390,11 +5124,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashift:ALL2 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*,*")])
;; "*ashlhi3"
@@ -5506,11 +5237,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashift:ALL4 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*,*")])
(define_insn "*ashl<mode>3"
@@ -5749,12 +5477,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashift:PSI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*")])
(define_insn "*ashlpsi3"
@@ -5808,10 +5532,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashiftrt:ALL1 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ashr<mode>3"
[(set (match_operand:ALL1 0 "register_operand" "=r,r ,r ,r")
@@ -5835,11 +5557,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashiftrt:ALL2 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*,*")])
;; "*ashrhi3"
@@ -5866,12 +5585,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashiftrt:PSI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*")])
(define_insn "*ashrpsi3"
@@ -5898,11 +5613,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (ashiftrt:ALL4 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*,*")])
(define_insn "*ashr<mode>3"
@@ -6013,10 +5725,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (lshiftrt:ALL1 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*lshr<mode>3"
[(set (match_operand:ALL1 0 "register_operand" "=r,r ,r ,r,r")
@@ -6039,11 +5749,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (lshiftrt:ALL2 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*,*")])
(define_insn "*lshr<mode>3"
@@ -6066,12 +5773,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (lshiftrt:PSI (match_dup 1)
- (match_dup 2)))
- (clobber (match_dup 3))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*")])
(define_insn "*lshrpsi3"
@@ -6098,11 +5801,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (lshiftrt:ALL4 (match_dup 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,*,3op,*,*")])
(define_insn "*lshr<mode>3"
@@ -6217,9 +5917,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (abs:QI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*absqi2"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -6237,9 +5936,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (abs:SF (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*abssf2"
[(set (match_operand:SF 0 "register_operand" "=d,r")
@@ -6260,9 +5958,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (neg:QI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*negqi2"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -6278,9 +5975,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (neg:HI (sign_extend:HI (match_dup 1))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*negqihi2"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -6296,9 +5992,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (neg:HI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*neghi2"
[(set (match_operand:HI 0 "register_operand" "=r,&r")
@@ -6316,9 +6011,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (neg:PSI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*negpsi2"
[(set (match_operand:PSI 0 "register_operand" "=!d,r,&r")
@@ -6337,10 +6031,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (neg:SI (match_dup 1)))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "*,*,mov,movw")])
(define_insn "*negsi2.libgcc"
@@ -6371,9 +6063,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (neg:SF (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*negsf2"
[(set (match_operand:SF 0 "register_operand" "=d,r")
@@ -6394,9 +6085,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (not:QI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*one_cmplqi2"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -6412,9 +6102,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (not:HI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*one_cmplhi2"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -6431,9 +6120,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (not:PSI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*one_cmplpsi2"
[(set (match_operand:PSI 0 "register_operand" "=r")
@@ -6449,9 +6137,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (not:SI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*one_cmplsi2"
[(set (match_operand:SI 0 "register_operand" "=r")
@@ -6480,9 +6167,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (sign_extend:HI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*extendqihi2"
[(set (match_operand:HI 0 "register_operand" "=r,r")
@@ -6501,9 +6187,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (sign_extend:PSI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*extendqipsi2"
[(set (match_operand:PSI 0 "register_operand" "=r,r")
@@ -6522,9 +6207,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (sign_extend:SI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*extendqisi2"
[(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -6543,9 +6227,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (sign_extend:PSI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*extendhipsi2"
[(set (match_operand:PSI 0 "register_operand" "=r,r")
@@ -6564,9 +6247,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (sign_extend:SI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*extendhisi2"
[(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -6585,9 +6267,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (sign_extend:SI (match_dup 1)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*extendpsisi2"
[(set (match_operand:SI 0 "register_operand" "=r")
@@ -7032,10 +6713,11 @@
"#"
"reload_completed"
[(set (reg:CC REG_CC)
- (compare:CC (match_dup 1) (match_dup 2)))
+ (compare:CC (match_dup 1)
+ (match_dup 2)))
(set (pc)
- (if_then_else (match_op_dup 0
- [(reg:CC REG_CC) (const_int 0)])
+ (if_then_else (match_op_dup 0 [(reg:CC REG_CC)
+ (const_int 0)])
(label_ref (match_dup 3))
(pc)))])
@@ -7054,11 +6736,12 @@
"#"
"reload_completed"
[(parallel [(set (reg:CC REG_CC)
- (compare:CC (match_dup 1) (match_dup 2)))
+ (compare:CC (match_dup 1)
+ (match_dup 2)))
(clobber (match_dup 4))])
(set (pc)
- (if_then_else (match_op_dup 0
- [(reg:CC REG_CC) (const_int 0)])
+ (if_then_else (match_op_dup 0 [(reg:CC REG_CC)
+ (const_int 0)])
(label_ref (match_dup 3))
(pc)))]
{
@@ -7081,11 +6764,12 @@
"#"
"reload_completed"
[(parallel [(set (reg:CC REG_CC)
- (compare:CC (match_dup 1) (match_dup 2)))
+ (compare:CC (match_dup 1)
+ (match_dup 2)))
(clobber (match_dup 4))])
(set (pc)
- (if_then_else (match_op_dup 0
- [(reg:CC REG_CC) (const_int 0)])
+ (if_then_else (match_op_dup 0 [(reg:CC REG_CC)
+ (const_int 0)])
(label_ref (match_dup 3))
(pc)))]
{
@@ -7109,11 +6793,12 @@
"#"
"reload_completed"
[(parallel [(set (reg:CC REG_CC)
- (compare:CC (match_dup 1) (match_dup 2)))
+ (compare:CC (match_dup 1)
+ (match_dup 2)))
(clobber (match_dup 4))])
(set (pc)
- (if_then_else (match_op_dup 0
- [(reg:CC REG_CC) (const_int 0)])
+ (if_then_else (match_op_dup 0 [(reg:CC REG_CC)
+ (const_int 0)])
(label_ref (match_dup 3))
(pc)))]
{
@@ -7668,17 +7353,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (pc)
- (if_then_else
- (match_op_dup 0
- [(zero_extract:QIDI
- (match_dup 1)
- (const_int 1)
- (match_dup 2))
- (const_int 0)])
- (label_ref (match_dup 3))
- (pc)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sbrx_branch<mode>"
[(set (pc)
@@ -7721,13 +7397,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (pc)
- (if_then_else (match_op_dup 0 [(and:QISI (match_dup 1)
- (match_dup 2))
- (const_int 0)])
- (label_ref (match_dup 3))
- (pc)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sbrx_and_branch<mode>"
[(set (pc)
@@ -7968,14 +7639,8 @@
"!AVR_HAVE_EIJMP_EICALL"
"#"
"&& reload_completed"
- [(parallel [(set (pc)
- (unspec:HI [(match_dup 0)]
- UNSPEC_INDEX_JMP))
- (use (label_ref (match_dup 1)))
- (clobber (match_dup 2))
- (clobber (const_int 0))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "rjmp,rjmp,jmp")])
(define_insn "*tablejump"
@@ -8004,14 +7669,8 @@
"AVR_HAVE_EIJMP_EICALL"
"#"
"&& reload_completed"
- [(parallel [(set (pc)
- (unspec:HI [(reg:HI REG_Z)]
- UNSPEC_INDEX_JMP))
- (use (label_ref (match_dup 0)))
- (clobber (reg:HI REG_Z))
- (clobber (reg:QI 24))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "eijmp")])
@@ -8182,17 +7841,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (pc)
- (if_then_else
- (match_operator 0 "eqne_operator"
- [(zero_extract:QIHI
- (mem:QI (match_dup 1))
- (const_int 1)
- (match_dup 2))
- (const_int 0)])
- (label_ref (match_dup 3))
- (pc)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sbix_branch"
[(set (pc)
@@ -8230,14 +7880,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (pc)
- (if_then_else
- (match_operator 0 "gelt_operator"
- [(mem:QI (match_dup 1))
- (const_int 0)])
- (label_ref (match_dup 2))
- (pc)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sbix_branch_bit7"
[(set (pc)
@@ -8277,17 +7921,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (pc)
- (if_then_else
- (match_operator 0 "eqne_operator"
- [(zero_extract:QIHI
- (mem:QI (match_dup 1))
- (const_int 1)
- (match_dup 2))
- (const_int 0)])
- (label_ref (match_dup 3))
- (pc)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sbix_branch_tmp"
[(set (pc)
@@ -8324,14 +7959,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (pc)
- (if_then_else
- (match_operator 0 "gelt_operator"
- [(mem:QI (match_dup 1))
- (const_int 0)])
- (label_ref (match_dup 2))
- (pc)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sbix_branch_tmp_bit7"
[(set (pc)
@@ -8784,13 +8413,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(unspec_volatile [(match_dup 0)
- (const_int 1)]
- UNSPECV_DELAY_CYCLES)
- (set (match_dup 1)
- (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER))
- (clobber (match_dup 2))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*delay_cycles_1"
[(unspec_volatile [(match_operand:QI 0 "const_int_operand" "n")
@@ -8816,14 +8440,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(unspec_volatile [(match_dup 0)
- (const_int 2)]
- UNSPECV_DELAY_CYCLES)
- (set (match_dup 1)
- (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER))
- (clobber (match_dup 2))
- (clobber (reg:CC REG_CC))])]
- ""
+ [(scratch)]
+ { DONE_ADD_CCC }
[(set_attr "isa" "adiw,no_adiw")])
(define_insn "*delay_cycles_2"
@@ -8853,15 +8471,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(unspec_volatile [(match_dup 0)
- (const_int 3)]
- UNSPECV_DELAY_CYCLES)
- (set (match_dup 1)
- (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER))
- (clobber (match_dup 2))
- (clobber (match_dup 3))
- (clobber (match_dup 4))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*delay_cycles_3"
[(unspec_volatile [(match_operand:SI 0 "const_int_operand" "n")
@@ -8896,16 +8507,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(unspec_volatile [(match_dup 0)
- (const_int 4)]
- UNSPECV_DELAY_CYCLES)
- (set (match_dup 1)
- (unspec_volatile:BLK [(match_dup 1)] UNSPECV_MEMORY_BARRIER))
- (clobber (match_dup 2))
- (clobber (match_dup 3))
- (clobber (match_dup 4))
- (clobber (match_dup 5))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*delay_cycles_4"
[(unspec_volatile [(match_operand:SI 0 "const_int_operand" "n")
@@ -8942,12 +8545,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (unspec:QI [(match_dup 1)
- (match_dup 2)
- (match_dup 3)]
- UNSPEC_INSERT_BITS))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*insert_bits"
[(set (match_operand:QI 0 "register_operand" "=r ,d ,r")
@@ -9127,12 +8726,13 @@
"#"
"reload_completed"
[(set (reg:CC REG_CC)
- (compare:CC (match_dup 0) (const_int 0)))
+ (compare:CC (match_dup 0)
+ (const_int 0)))
(set (pc)
- (if_then_else (ge (reg:CC REG_CC) (const_int 0))
+ (if_then_else (ge (reg:CC REG_CC)
+ (const_int 0))
(label_ref (match_dup 1))
- (pc)))]
- "")
+ (pc)))])
(define_expand "flash_segment"
[(parallel [(match_operand:QI 0 "register_operand" "")
@@ -9235,9 +8835,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (parity:HI (reg:HI 24)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*parityhi2.libgcc"
[(set (reg:HI 24)
@@ -9253,9 +8852,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (zero_extend:HI (parity:QI (reg:QI 24))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*parityqihi2.libgcc"
[(set (reg:HI 24)
@@ -9271,9 +8869,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (truncate:HI (parity:SI (reg:SI 22))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*paritysihi2.libgcc"
[(set (reg:HI 24)
@@ -9329,9 +8926,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (popcount:HI (reg:HI 24)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*popcounthi2.libgcc"
[(set (reg:HI 24)
@@ -9347,9 +8943,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (truncate:HI (popcount:SI (reg:SI 22))))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*popcountsi2.libgcc"
[(set (reg:HI 24)
@@ -9365,9 +8960,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:QI 24)
- (popcount:QI (reg:QI 24)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*popcountqi2.libgcc"
[(set (reg:QI 24)
@@ -9421,10 +9015,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (clz:HI (reg:HI 24)))
- (clobber (reg:QI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*clzhi2.libgcc"
[(set (reg:HI 24)
@@ -9442,10 +9034,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (truncate:HI (clz:SI (reg:SI 22))))
- (clobber (reg:QI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*clzsihi2.libgcc"
[(set (reg:HI 24)
@@ -9490,10 +9080,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (ctz:HI (reg:HI 24)))
- (clobber (reg:QI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ctzhi2.libgcc"
[(set (reg:HI 24)
@@ -9512,11 +9100,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (truncate:HI (ctz:SI (reg:SI 22))))
- (clobber (reg:QI 22))
- (clobber (reg:QI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ctzsihi2.libgcc"
[(set (reg:HI 24)
@@ -9562,10 +9147,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (ffs:HI (reg:HI 24)))
- (clobber (reg:QI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ffshi2.libgcc"
[(set (reg:HI 24)
@@ -9584,11 +9167,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 24)
- (truncate:HI (ffs:SI (reg:SI 22))))
- (clobber (reg:QI 22))
- (clobber (reg:QI 26))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*ffssihi2.libgcc"
[(set (reg:HI 24)
@@ -9633,9 +9213,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (reg:SI 22)
- (bswap:SI (reg:SI 22)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*bswapsi2.libgcc"
[(set (reg:SI 22)
@@ -9742,11 +9321,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (unspec:HI [(match_dup 1)
- (match_dup 2)]
- UNSPEC_FMUL))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fmul_insn"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -9768,11 +9344,8 @@
"!AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 22)
- (unspec:HI [(reg:QI 24)
- (reg:QI 25)] UNSPEC_FMUL))
- (clobber (reg:HI 24))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fmul.call"
[(set (reg:HI 22)
@@ -9814,11 +9387,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (unspec:HI [(match_dup 1)
- (match_dup 2)]
- UNSPEC_FMULS))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fmuls_insn"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -9840,11 +9410,8 @@
"!AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 22)
- (unspec:HI [(reg:QI 24)
- (reg:QI 25)] UNSPEC_FMULS))
- (clobber (reg:HI 24))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fmuls.call"
[(set (reg:HI 22)
@@ -9886,11 +9453,8 @@
"AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (unspec:HI [(match_dup 1)
- (match_dup 2)]
- UNSPEC_FMULSU))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fmulsu_insn"
[(set (match_operand:HI 0 "register_operand" "=r")
@@ -9912,11 +9476,8 @@
"!AVR_HAVE_MUL"
"#"
"&& reload_completed"
- [(parallel [(set (reg:HI 22)
- (unspec:HI [(reg:QI 24)
- (reg:QI 25)] UNSPEC_FMULSU))
- (clobber (reg:HI 24))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*fmulsu.call"
[(set (reg:HI 22)
@@ -10037,11 +9598,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (zero_extract:QI (match_dup 0)
- (const_int 1)
- (match_dup 1))
- (match_dup 2))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*insv.reg"
[(set (zero_extract:QI (match_operand:QI 0 "register_operand" "+r,d,d,l,l")
@@ -10478,11 +10036,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (zero_extract:QI (not:QI (match_dup 1))
- (const_int 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*extzv.not"
[(set (match_operand:QI 0 "register_operand" "=r")
@@ -10619,11 +10174,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (and:QISI (any_shift:QISI (match_dup 1)
- (match_dup 2))
- (match_dup 3)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*insv.any_shift.<mode>"
[(set (match_operand:QISI 0 "register_operand" "=r")
@@ -10686,11 +10238,8 @@
""
"#"
"&& reload_completed"
- [(parallel [(set (match_dup 0)
- (sign_extract:QISI (match_dup 1)
- (const_int 1)
- (match_dup 2)))
- (clobber (reg:CC REG_CC))])])
+ [(scratch)]
+ { DONE_ADD_CCC })
(define_insn "*sextr.<QISI:mode>.<QISI2:mode>"
[(set (match_operand:QISI 0 "register_operand" "=r")
diff --git a/gcc/config/avr/avr.opt b/gcc/config/avr/avr.opt
index 9883119..2bed8ea 100644
--- a/gcc/config/avr/avr.opt
+++ b/gcc/config/avr/avr.opt
@@ -18,10 +18,6 @@
; along with GCC; see the file COPYING3. If not see
; <http://www.gnu.org/licenses/>.
-mlra
-Target Var(avropt_lra_p) UInteger Init(1) Optimization Undocumented
-Usa LRA for reload instead of the old reload framework. This option is experimental, on per default, and it may be removed in future versions of the compiler.
-
mcall-prologues
Target Mask(CALL_PROLOGUES) Optimization
Optimization. Use subroutines for function prologues and epilogues.
@@ -164,6 +160,10 @@ mfuse-move=
Target Joined RejectNegative UInteger Var(avropt_fuse_move) Init(0) Optimization IntegerRange(0, 23)
-mfuse-move=<0,23> Optimization. Run a post-reload pass that tweaks move instructions.
+mfuse-move2
+Target Var(avropt_fuse_move2) Init(0) Optimization
+Optimization. Fuse some move insns after insn combine.
+
mabsdata
Target Mask(ABSDATA)
Assume that all data in static storage can be accessed by LDS / STS instructions. This option is only useful for reduced Tiny devices like ATtiny40.
diff --git a/gcc/config/avr/avr.opt.urls b/gcc/config/avr/avr.opt.urls
index 662fdee..fa560bc 100644
--- a/gcc/config/avr/avr.opt.urls
+++ b/gcc/config/avr/avr.opt.urls
@@ -1,7 +1,5 @@
; Autogenerated by regenerate-opt-urls.py from gcc/config/avr/avr.opt and generated HTML
-; skipping UrlSuffix for 'mlra' due to finding no URLs
-
mcall-prologues
UrlSuffix(gcc/AVR-Options.html#index-mcall-prologues)
@@ -92,6 +90,9 @@ UrlSuffix(gcc/AVR-Options.html#index-mfuse-move)
mfuse-move=
UrlSuffix(gcc/AVR-Options.html#index-mfuse-move)
+mfuse-move2
+UrlSuffix(gcc/AVR-Options.html#index-mfuse-move2)
+
mabsdata
UrlSuffix(gcc/AVR-Options.html#index-mabsdata)
diff --git a/gcc/config/avr/specs.h b/gcc/config/avr/specs.h
index ff269bf..c95c758 100644
--- a/gcc/config/avr/specs.h
+++ b/gcc/config/avr/specs.h
@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see
"%(asm_errata_skip) "
#define LINK_RELAX_SPEC \
- "%{mrelax:--relax} "
+ "%{!r:%{mrelax:--relax}} "
#undef LINK_SPEC
#define LINK_SPEC \
diff --git a/gcc/config/cris/cris.h b/gcc/config/cris/cris.h
index 1681c79..f356679 100644
--- a/gcc/config/cris/cris.h
+++ b/gcc/config/cris/cris.h
@@ -171,7 +171,7 @@ extern int cris_cpu_version;
/* For the cris-*-elf subtarget. */
#define CRIS_ASM_SUBTARGET_SPEC \
- "--em=criself %{!march=*:%{!mcpu=*:" CRIS_DEFAULT_ASM_ARCH_OPTION "}}"
+ "--emulation=criself %{!march=*:%{!mcpu=*:" CRIS_DEFAULT_ASM_ARCH_OPTION "}}"
/* FIXME: We should propagate the -melf option to make the criself
"emulation" unless a linker script is provided (-T*), but I don't know
diff --git a/gcc/config/darwin-sections.def b/gcc/config/darwin-sections.def
index 44adcc6..76587c2 100644
--- a/gcc/config/darwin-sections.def
+++ b/gcc/config/darwin-sections.def
@@ -215,3 +215,10 @@ DEF_SECTION (objc2_method_names_section, 0,
DEF_SECTION (objc2_method_types_section, 0,
".section __TEXT, __objc_methtype, cstring_literals", 1)
+
+/* ASAN sections. */
+
+DEF_SECTION (asan_string_section, 0, ".section __TEXT, __asan_cstring", 0)
+DEF_SECTION (asan_globals_section, 0, ".section __DATA, __asan_globals", 0)
+DEF_SECTION (asan_liveness_section, 0,
+ ".section __DATA,__asan_liveness,regular,live_support", 0)
diff --git a/gcc/config/darwin.cc b/gcc/config/darwin.cc
index be2daed..75ac356 100644
--- a/gcc/config/darwin.cc
+++ b/gcc/config/darwin.cc
@@ -49,6 +49,7 @@ along with GCC; see the file COPYING3. If not see
#include "optabs.h"
#include "flags.h"
#include "opts.h"
+#include "asan.h"
/* Fix and Continue.
@@ -1298,6 +1299,39 @@ darwin_encode_section_info (tree decl, rtx rtl, int first)
SYMBOL_FLAG_EXTERNAL. */
default_encode_section_info (decl, rtl, first);
+ if (CONSTANT_CLASS_P (decl))
+ {
+ bool is_str = TREE_CODE (decl) == STRING_CST;
+ rtx sym_ref = XEXP (rtl, 0);
+
+ /* Unless this is a string cst or we are in an anchored section we have
+ nothing more to do here. */
+ if (!is_str && !SYMBOL_REF_HAS_BLOCK_INFO_P (sym_ref))
+ return;
+
+ tree sym_decl = SYMBOL_REF_DECL (sym_ref);
+ const char *name = XSTR (sym_ref, 0);
+ gcc_checking_assert (strncmp ("*lC", name, 3) == 0);
+
+ char *buf;
+ if (is_str)
+ {
+ bool for_asan = (flag_sanitize & SANITIZE_ADDRESS)
+ && asan_protect_global (CONST_CAST_TREE (decl));
+ /* When we are generating code for sanitized strings, the string
+ internal symbols are made visible in the object. */
+ buf = xasprintf ("*%c.str.%s", for_asan ? 'l' : 'L', &name[3]);
+ }
+ else
+ /* Let's identify anchored constants with a different prefix, for the
+ sake of inspection only. */
+ buf = xasprintf ("*LaC%s", &name[3]);
+ if (sym_decl)
+ DECL_NAME (sym_decl) = get_identifier (buf);
+ XSTR (sym_ref, 0) = ggc_strdup (buf);
+ free (buf);
+ }
+
if (! VAR_OR_FUNCTION_DECL_P (decl))
return;
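Illustrative sketch of the symbol renaming performed by the hunk above (hypothetical label names; the digits are copied verbatim from the original "*lC<N>" label):

    /* Assuming an incoming label "*lC42", the rewritten names would be:
         STRING_CST, ASAN-protected   -> "*l.str.42"  (lowercase 'l' keeps the
                                          symbol visible in the object)
         STRING_CST, not protected    -> "*L.str.42"
         anchored constant (non-str)  -> "*LaC42"   */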
@@ -1683,6 +1717,17 @@ machopic_select_section (tree decl,
ro = TREE_READONLY (decl) || TREE_CONSTANT (decl) ;
+ /* Trump categorize_decl_for_section () for ASAN stuff - the Darwin
+ categorisations are special. */
+ if (flag_sanitize & SANITIZE_ADDRESS)
+ {
+ if (TREE_CODE (decl) == STRING_CST
+ && asan_protect_global (CONST_CAST_TREE (decl)))
+ {
+ return darwin_sections[asan_string_section];
+ }
+ }
+
switch (categorize_decl_for_section (decl, reloc))
{
case SECCAT_TEXT:
@@ -1699,7 +1744,12 @@ machopic_select_section (tree decl,
break;
case SECCAT_RODATA_MERGE_STR_INIT:
- base_section = darwin_mergeable_string_section (DECL_INITIAL (decl), align);
+ if ((flag_sanitize & SANITIZE_ADDRESS)
+ && asan_protect_global (CONST_CAST_TREE (decl)))
+ /* or !flag_merge_constants */
+ return darwin_sections[asan_string_section];
+ else
+ return darwin_mergeable_string_section (DECL_INITIAL (decl), align);
break;
case SECCAT_RODATA_MERGE_CONST:
@@ -3297,11 +3347,16 @@ darwin_use_anchors_for_symbol_p (const_rtx symbol)
{
if (DARWIN_SECTION_ANCHORS && flag_section_anchors)
{
- section *sect;
- /* If the section contains a zero-sized object it's ineligible. */
- sect = SYMBOL_REF_BLOCK (symbol)->sect;
- /* This should have the effect of disabling anchors for vars that follow
- any zero-sized one, in a given section. */
+ tree decl = SYMBOL_REF_DECL (symbol);
+ /* If the symbol would be linker-visible, then the section can be split
+ at that symbol, so we must disallow anchoring it. This is stricter
+ than the default implementation. TODO: add other cases. */
+ if (decl && DECL_P (decl)
+ && (TREE_PUBLIC (decl) || !DECL_ARTIFICIAL (decl)))
+ return false;
+
+ /* We mark sections containing unsuitable entries. */
+ section *sect = SYMBOL_REF_BLOCK (symbol)->sect;
if (sect->common.flags & SECTION_NO_ANCHOR)
return false;
diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h
index 9b9a3fe..c3e28e2 100644
--- a/gcc/config/darwin.h
+++ b/gcc/config/darwin.h
@@ -287,6 +287,19 @@ extern GTY(()) int darwin_ms_struct;
#define DARWIN_RDYNAMIC "%{rdynamic:%nrdynamic is not supported}"
#endif
+#if LD64_HAS_NO_DEDUPLICATE
+/* What we want is "when the optimization level is debug OR when it is
+ a compile & link job with implied O0 optimization". */
+#define DARWIN_LD_NO_DEDUPLICATE \
+ "%{O0|O1|O|Og: -no_deduplicate} \
+ %{!O*:\
+ %{.c|.cc|.C|.cpp|.cp|.c++|.cxx|.CPP|.m|.mm|.s|.S|.i|.ii|.mi|.mii|\
+ .f|.for|.ftn|.fpp|.f90|.f95|.f03|.f08|.f77|.F|.F90|.F95|.F03|.F08|\
+ .d|.mod: -no_deduplicate }} "
+#else
+#define DARWIN_LD_NO_DEDUPLICATE ""
+#endif
+
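Reading of the spec above (illustrative, not part of the patch): a compile-and-link of a plain C source with no -O option, or with an explicit -O0/-O1/-O/-Og, would pass -no_deduplicate to ld64, whereas an -O2 or -O3 link would not.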
#if LD64_HAS_MACOS_VERSION_MIN
# define DARWIN_PLATFORM_ID \
"%{mmacosx-version-min=*:-macos_version_min %*} "
@@ -403,10 +416,14 @@ extern GTY(()) int darwin_ms_struct;
%(linker)" \
DARWIN_LD_DEMANGLE \
LINK_PLUGIN_SPEC \
+ DARWIN_LD_NO_DEDUPLICATE \
"%{flto*:%<fcompare-debug*} \
%{flto} %{fno-lto} %{flto=*} \
- %l " \
+ %{static}%{!static:%{!dynamic:-dynamic}} \
+ %{force_cpusubtype_ALL:-arch %(darwin_arch)} \
+ %{!force_cpusubtype_ALL:-arch %(darwin_subarch)} "\
DARWIN_PLATFORM_ID \
+ " %l " \
LINK_COMPRESS_DEBUG_SPEC \
"%X %{s} %{t} %{Z} %{u*} \
%{e*} %{r} \
@@ -493,9 +510,8 @@ extern GTY(()) int darwin_ms_struct;
Note that options taking arguments may appear multiple times on a command
line with different arguments each time, so put a * after their names so
all of them get passed. */
-#define LINK_SPEC \
- "%{static}%{!static:%{!dynamic:-dynamic}} \
- %:remove-outfile(-ldl) \
+#define LINK_SPEC \
+ "%:remove-outfile(-ldl) \
%:remove-outfile(-lm) \
%:remove-outfile(-lpthread) \
%{fgnu-runtime: %{static|static-libgcc: \
@@ -511,9 +527,7 @@ extern GTY(()) int darwin_ms_struct;
%{static|static-libgm2:%:replace-outfile(-lm2iso libm2iso.a%s)}\
%{static|static-libgm2:%:replace-outfile(-lm2min libm2min.a%s)}\
%{static|static-libgm2:%:replace-outfile(-lm2log libm2log.a%s)}\
- %{static|static-libgm2:%:replace-outfile(-lm2cor libm2cor.a%s)}\
- %{force_cpusubtype_ALL:-arch %(darwin_arch)} \
- %{!force_cpusubtype_ALL:-arch %(darwin_subarch)} "\
+ %{static|static-libgm2:%:replace-outfile(-lm2cor libm2cor.a%s)} "\
LINK_SYSROOT_SPEC \
"%{!multiply_defined*:%{shared-libgcc: \
%:version-compare(< 10.5 mmacosx-version-min= -multiply_defined) \
@@ -1005,6 +1019,8 @@ extern GTY(()) section * darwin_sections[NUM_DARWIN_SECTIONS];
sprintf (LABEL, "*%s%ld", "lASAN", (long)(NUM));\
else if (strcmp ("LTRAMP", PREFIX) == 0) \
sprintf (LABEL, "*%s%ld", "lTRAMP", (long)(NUM));\
+ else if (strncmp ("LANCHOR", PREFIX, 7) == 0) \
+ sprintf (LABEL, "*%s%ld", "lANCHOR", (long)(NUM));\
else \
sprintf (LABEL, "*%s%ld", PREFIX, (long)(NUM)); \
} while (0)
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index fe68678..0287400 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -92,6 +92,8 @@ enum hsaco_attr_type
/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag
for non-scalar memory operations. The string starts on purpose with a space.
Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used.
+ Note: on atomics, glc/sc0 denotes whether the pre-op value should
+ be returned.
CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however,
there is no non-scalar user so far. */
#define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc")
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 0994329..a34d2e3 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -3938,6 +3938,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -3992,6 +3993,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -4050,6 +4052,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
@@ -4073,6 +4076,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 8959118..5ffeb23 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -54,6 +54,7 @@
#include "gimple.h"
#include "cgraph.h"
#include "case-cfn-macros.h"
+#include "opts.h"
/* This file should be included last. */
#include "target-def.h"
@@ -183,6 +184,11 @@ gcn_option_override (void)
if (flag_sram_ecc == HSACO_ATTR_DEFAULT)
flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default;
+
+ /* TODO: This seems to produce tighter loops, but the testsuite expects it
+ to be set to '2', so I'll leave it default for now.
+ SET_OPTION_IF_UNSET (&global_options, &global_options_set,
+ param_vect_partial_vector_usage, 1); */
}
/* }}} */
@@ -5789,45 +5795,19 @@ gcn_libc_has_function (enum function_class fn_class,
return bsd_libc_has_function (fn_class, type);
}
-/* }}} */
-/* {{{ md_reorg pass. */
-
-/* Identify V_CMPX from the "type" attribute;
- note: this will also match 'v_cmp %E1 vcc'. */
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER. */
static bool
-gcn_cmpx_insn_p (attr_type type)
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+ int ARG_UNUSED (scale),
+ unsigned int ARG_UNUSED (group_size))
{
- switch (type)
- {
- case TYPE_VOPC:
- return true;
- case TYPE_MUBUF:
- case TYPE_MTBUF:
- case TYPE_FLAT:
- case TYPE_VOP3P_MAI:
- case TYPE_UNKNOWN:
- case TYPE_SOP1:
- case TYPE_SOP2:
- case TYPE_SOPK:
- case TYPE_SOPC:
- case TYPE_SOPP:
- case TYPE_SMEM:
- case TYPE_DS:
- case TYPE_VOP2:
- case TYPE_VOP1:
- case TYPE_VOP3A:
- case TYPE_VOP3B:
- case TYPE_VOP_SDWA:
- case TYPE_VOP_DPP:
- case TYPE_MULT:
- case TYPE_VMULT:
- return false;
- }
- gcc_unreachable ();
- return false;
+ return true;
}
+/* }}} */
+/* {{{ md_reorg pass. */
+
/* Identify VMEM instructions from their "type" attribute. */
static bool
@@ -6356,19 +6336,59 @@ gcn_md_reorg (void)
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
nops_rqd = ivccwait - prev_insn->age;
+ /* NOTE: The following condition for adding a wait state exists, but
+ GCC does not access the special registers using their SGPR#.
+ Thus, no action is required here. The following wait-state
+ condition exists at least for VEGA/gfx900+ to CDNA3:
+ Mixed use of VCC: alias vs. SGPR# - v_readlane,
+ v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale
+ followed by VALU reads VCC as constant requires 1 wait state.
+ (As carry-in, it requires none.)
+ [VCC can be accessed by name or logical SGPR that holds it.] */
+
+ /* Testing indicates that CDNA3 requires an s_nop between
+ e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'.
+ Thus: add it between a v_cmp writing VCC and a VALU read of VCC. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && iunit == UNIT_VECTOR
+ && (hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG]))
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMP)
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU writes SGPR/VCC: v_readlane, v_readfirstlane, v_cmp,
+ v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by:
+ - VALU reads SGPR as constant requires 1 wait state
+ - VALU reads SGPR as carry-in requires no wait state
+ - v_readlane/v_writelane reads SGPR as lane select requires 4 wait
+ states. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && iunit == UNIT_VECTOR
+ && prev_insn->unit == UNIT_VECTOR
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_SRC_REGS]))
+ {
+ if (get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1)
+ nops_rqd = 1 - prev_insn->age;
+ }
+
/* CDNA3: v_cmpx followed by
- V_readlane, v_readfirstlane, v_writelane requires 4 wait states
- VALU reads EXEC as constant requires 2 wait states
- other VALU requires no wait state */
if (TARGET_CDNA3_NOPS
&& (prev_insn->age + nops_rqd) < 4
- && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
&& get_attr_laneselect (insn) != LANESELECT_NO)
nops_rqd = 4 - prev_insn->age;
else if (TARGET_CDNA3_NOPS
&& (prev_insn->age + nops_rqd) < 2
&& iunit == UNIT_VECTOR
- && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
&& TEST_HARD_REG_BIT (ireads, EXECZ_REG))
nops_rqd = 2 - prev_insn->age;
@@ -6436,8 +6456,8 @@ gcn_md_reorg (void)
}
/* Insert the required number of NOPs. */
- for (int i = nops_rqd; i > 0; i--)
- emit_insn_after (gen_nop (), last_insn);
+ if (nops_rqd > 0)
+ emit_insn_after (gen_nops (GEN_INT (nops_rqd-1)), last_insn);
/* Age the previous instructions. We can also ignore writes to
registers subsequently overwritten. */
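A small worked example of the new NOP emission (assumed values, for illustration only): if the preceding v_cmp has age 0 and the CDNA3 rule above requires one wait state, nops_rqd becomes 1 and a single insn is emitted instead of a loop of single NOPs:

    /* nops_rqd == 1: one wait state requested.  */
    emit_insn_after (gen_nops (GEN_INT (nops_rqd - 1)), last_insn);
    /* ...which assembles to "s_nop 0x0", i.e. one NOP, since the "nops"
       pattern below (gcn.md) encodes N+1 wait states in its operand.  */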
@@ -7283,6 +7303,11 @@ print_operand_address (FILE *file, rtx mem)
H - print second part of a multi-reg value (high-part of 2-reg value)
J - print third part of a multi-reg value
K - print fourth part of a multi-reg value
+ R - Print a scalar register number as an integer. Temporary hack.
+ V - Print a vector register number as an integer. Temporary hack.
+
+ Additionally, the standard built-in modifiers c, n, a, and l exist; see gccint's
+ "Output Templates and Operand Substitution" for details.
*/
void
@@ -8131,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl)
gcn_vectorize_builtin_vectorized_function
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index fad42e6..4130cf6 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -324,6 +324,11 @@
"store,storex34,load,atomic,atomicwait,cmpswapx2,no"
(const_string "no"))
+; Identify v_cmp and v_cmpx instructions for "Manually Inserted Wait State"
+; handling.
+
+(define_attr "vcmp" "vcmp,vcmpx,no" (const_string "no"))
+
; Identify instructions that require "Manually Inserted Wait State" if
; a previous instruction writes to VCC. The number gives the number of NOPs.
@@ -424,6 +429,15 @@
"s_nop\t0x0"
[(set_attr "type" "sopp")])
+; Variant of 'nop' that accepts a count argument.
+; s_nop accepts 0x0 to 0xf for 1 to 16 nops; however,
+; as %0 prints the operand in decimal, only 0 to 9 (= 1 to 10 nops) can be used.
+(define_insn "nops"
+ [(match_operand 0 "const_int_operand")]
+ ""
+ "s_nop\t0x%0"
+ [(set_attr "type" "sopp")])
+
; FIXME: What should the value of the immediate be? Zero is disallowed, so
; pick 1 for now.
(define_insn "trap"
@@ -566,6 +580,7 @@
[(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
flat,flat,flat,flat")
(set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store")
+ (set_attr "vcmp" "*,*,*,*,vcmp,*,*,*,*,*,*,*,*,*,*")
(set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
(set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
(set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")
@@ -1089,6 +1104,7 @@
s_cmp%D1\t%2, %3
v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "sopc,vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_insn "cstoredi4_vector"
@@ -1099,6 +1115,7 @@
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranchdi4"
@@ -1125,6 +1142,7 @@
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranch<mode>4"
@@ -2165,7 +2183,7 @@
? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol");
@@ -2177,7 +2195,7 @@
? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
@@ -2224,7 +2242,7 @@
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1"
: "error: cache architectire unspecified");
case 2:
return (TARGET_GLn_CACHE
@@ -2232,7 +2250,7 @@
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1"
: "error: cache architecture unspecified");
}
break;
@@ -2252,7 +2270,8 @@
? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;"
+ "flat_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "error: cache architecture unspecified");
case 2:
@@ -2263,7 +2282,8 @@
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
+ "global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
}
@@ -2347,7 +2367,7 @@
? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0"
: "error: cache architecture unspecified");
case 2:
@@ -2360,7 +2380,7 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)"
: "error: cache architecture unspecified");
@@ -2382,7 +2402,7 @@
? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "error: cache architecture unspecified");
case 2:
@@ -2395,7 +2415,7 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
diff --git a/gcc/config/h8300/addsub.md b/gcc/config/h8300/addsub.md
index 32eba9d..f153625 100644
--- a/gcc/config/h8300/addsub.md
+++ b/gcc/config/h8300/addsub.md
@@ -271,7 +271,7 @@
(match_operand:QHSI 2 "register_operand" "r"))
(match_dup 1)))
(set (match_operand:QHSI 0 "register_operand" "=r")
- (plus (match_dup 1) (match_dup 2)))
+ (plus:QHSI (match_dup 1) (match_dup 2)))
(clobber (reg:CC CC_REG))]
""
{
diff --git a/gcc/config/h8300/jumpcall.md b/gcc/config/h8300/jumpcall.md
index 4e63408..44847e4 100644
--- a/gcc/config/h8300/jumpcall.md
+++ b/gcc/config/h8300/jumpcall.md
@@ -156,7 +156,7 @@
"#"
"&& reload_completed"
[(set (reg:CCZ CC_REG)
- (eq (zero_extract:HSI (match_dup 1) (const_int 1) (match_dup 2))
+ (eq:CCZ (zero_extract:HSI (match_dup 1) (const_int 1) (match_dup 2))
(const_int 0)))
(set (pc)
(if_then_else (match_op_dup 3 [(reg:CCZ CC_REG) (const_int 0)])
@@ -181,7 +181,7 @@
(lshiftrt:SI (match_dup 1) (const_int 16))))
(clobber (reg:CC CC_REG))])
(set (reg:CCZ CC_REG)
- (eq (zero_extract:SI (match_dup 4) (const_int 1) (match_dup 2))
+ (eq:CCZ (zero_extract:SI (match_dup 4) (const_int 1) (match_dup 2))
(const_int 0)))
(set (pc)
(if_then_else (match_op_dup 3 [(reg:CCZ CC_REG) (const_int 0)])
@@ -288,7 +288,7 @@
})
(define_insn "call_insn_<mode>"
- [(call (mem:QI (match_operand 0 "call_insn_operand" "Cr"))
+ [(call (mem:QI (match_operand:P 0 "call_insn_operand" "Cr"))
(match_operand:P 1 "general_operand" "g"))]
"!SIBLING_CALL_P (insn)"
{
@@ -326,7 +326,7 @@
(define_insn "call_value_insn_<mode>"
[(set (match_operand 0 "" "=r")
- (call (mem:QI (match_operand 1 "call_insn_operand" "Cr"))
+ (call (mem:QI (match_operand:P 1 "call_insn_operand" "Cr"))
(match_operand:P 2 "general_operand" "g")))]
"!SIBLING_CALL_P (insn)"
{
@@ -358,7 +358,7 @@
})
(define_insn "sibcall_insn_<mode>"
- [(call (mem:QI (match_operand 0 "call_insn_operand" "Cr"))
+ [(call (mem:QI (match_operand:P 0 "call_insn_operand" "Cr"))
(match_operand:P 1 "general_operand" "g"))]
"SIBLING_CALL_P (insn)"
{
@@ -396,7 +396,7 @@
(define_insn "sibcall_value_insn_<mode>"
[(set (match_operand 0 "" "=r")
- (call (mem:QI (match_operand 1 "call_insn_operand" "Cr"))
+ (call (mem:QI (match_operand:P 1 "call_insn_operand" "Cr"))
(match_operand:P 2 "general_operand" "g")))]
"SIBLING_CALL_P (insn)"
{
diff --git a/gcc/config/h8300/testcompare.md b/gcc/config/h8300/testcompare.md
index 694c9e6..3b43381 100644
--- a/gcc/config/h8300/testcompare.md
+++ b/gcc/config/h8300/testcompare.md
@@ -28,7 +28,7 @@
;;
(define_insn ""
[(set (reg:CCZ CC_REG)
- (eq (zero_extract:HSI (match_operand:HSI 0 "register_operand" "r")
+ (eq:CCZ (zero_extract:HSI (match_operand:HSI 0 "register_operand" "r")
(const_int 1)
(match_operand 1 "const_int_operand" "n"))
(const_int 0)))]
@@ -54,7 +54,7 @@
(define_insn "*tsthi_upper"
[(set (reg:CCZN CC_REG)
- (compare (and:HI (match_operand:HI 0 "register_operand" "r")
+ (compare:CCZN (and:HI (match_operand:HI 0 "register_operand" "r")
(const_int -256))
(const_int 0)))]
"reload_completed"
@@ -63,7 +63,7 @@
(define_insn "*tsthi_upper_z"
[(set (reg:CCZ CC_REG)
- (compare (and:HI (match_operand:HI 0 "register_operand" "r")
+ (compare:CCZ (and:HI (match_operand:HI 0 "register_operand" "r")
(const_int -256))
(const_int 0)))]
"reload_completed"
@@ -72,7 +72,7 @@
(define_insn "*tstsi_upper"
[(set (reg:CCZN CC_REG)
- (compare (and:SI (match_operand:SI 0 "register_operand" "r")
+ (compare:CCZN (and:SI (match_operand:SI 0 "register_operand" "r")
(const_int -65536))
(const_int 0)))]
"reload_completed"
@@ -81,7 +81,7 @@
(define_insn "*cmp<mode>_c"
[(set (reg:CCC CC_REG)
- (ltu (match_operand:QHSI 0 "h8300_dst_operand" "rQ")
+ (ltu:CCC (match_operand:QHSI 0 "h8300_dst_operand" "rQ")
(match_operand:QHSI 1 "h8300_src_operand" "rQi")))]
"reload_completed"
{
@@ -97,7 +97,7 @@
(define_insn "*cmpqi_z"
[(set (reg:CCZ CC_REG)
- (eq (match_operand:QI 0 "h8300_dst_operand" "rQ")
+ (eq:CCZ (match_operand:QI 0 "h8300_dst_operand" "rQ")
(match_operand:QI 1 "h8300_src_operand" "rQi")))]
"reload_completed"
{ return "cmp.b %X1,%X0"; }
@@ -105,7 +105,7 @@
(define_insn "*cmphi_z"
[(set (reg:CCZ CC_REG)
- (eq (match_operand:HI 0 "h8300_dst_operand" "rQ")
+ (eq:CCZ (match_operand:HI 0 "h8300_dst_operand" "rQ")
(match_operand:HI 1 "h8300_src_operand" "rQi")))]
"reload_completed"
{ return "cmp.w %T1,%T0"; }
@@ -113,7 +113,7 @@
(define_insn "*cmpsi_z"
[(set (reg:CCZ CC_REG)
- (eq (match_operand:SI 0 "h8300_dst_operand" "rQ")
+ (eq:CCZ (match_operand:SI 0 "h8300_dst_operand" "rQ")
(match_operand:SI 1 "h8300_src_operand" "rQi")))]
"reload_completed"
{ return "cmp.l %S1,%S0"; }
@@ -121,7 +121,7 @@
(define_insn "*cmpqi"
[(set (reg:CC CC_REG)
- (compare (match_operand:QI 0 "h8300_dst_operand" "rQ")
+ (compare:CC (match_operand:QI 0 "h8300_dst_operand" "rQ")
(match_operand:QI 1 "h8300_src_operand" "rQi")))]
"reload_completed"
"cmp.b %X1,%X0"
@@ -129,7 +129,7 @@
(define_insn "*cmphi"
[(set (reg:CC CC_REG)
- (compare (match_operand:HI 0 "h8300_dst_operand" "rU,rQ")
+ (compare:CC (match_operand:HI 0 "h8300_dst_operand" "rU,rQ")
(match_operand:HI 1 "h8300_src_operand" "P3>X,rQi")))]
"reload_completed"
{
@@ -150,7 +150,7 @@
(define_insn "cmpsi"
[(set (reg:CC CC_REG)
- (compare (match_operand:SI 0 "h8300_dst_operand" "r,rQ")
+ (compare:CC (match_operand:SI 0 "h8300_dst_operand" "r,rQ")
(match_operand:SI 1 "h8300_src_operand" "P3>X,rQi")))]
"reload_completed"
{
@@ -176,7 +176,7 @@
(define_peephole2
[(match_scratch:QHSI 1 "r")
(set (reg:CC CC_REG)
- (compare (match_operand:QHSI 0 "memory_operand" "")
+ (compare:CC (match_operand:QHSI 0 "memory_operand" "")
(const_int 0)))]
"!mode_dependent_address_p (XEXP (operands[0], 0), MEM_ADDR_SPACE (operands[0]))"
[(parallel [(set (reg:CCZN CC_REG) (compare:CCZN (match_dup 0) (const_int 0)))
@@ -187,7 +187,7 @@
(define_peephole2
[(match_scratch:QHSI 1 "r")
(set (reg:CC CC_REG)
- (compare (match_operand:QHSI 0 "memory_operand" "")
+ (compare:CC (match_operand:QHSI 0 "memory_operand" "")
(const_int 0)))]
"mode_dependent_address_p (XEXP (operands[0], 0), MEM_ADDR_SPACE (operands[0]))"
[(parallel [(set (match_dup 1) (match_dup 0)) (clobber (reg:CC CC_REG))])
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 09aa9b1..3278f1f 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3151,7 +3151,7 @@ ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
}
/* Expand floating point op0 <=> op1, i.e.
- dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
+ dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128. */
void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
@@ -3264,7 +3264,7 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
if (l2)
{
emit_label (l2);
- emit_move_insn (dest, op2 == const0_rtx ? const2_rtx : op2);
+ emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
}
emit_label (lend);
}
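A minimal C sketch of the value mapping the updated comment describes (illustrative only; the real code emits an RTL compare-and-branch sequence, and op2 overrides the unordered result, defaulting to -128 when op2 is const0_rtx):

    /* Three-way floating-point comparison result, as documented above.  */
    static int
    fp_spaceship (double a, double b)
    {
      if (a == b)
        return 0;
      if (a < b)
        return -1;
      if (a > b)
        return 1;
      return -128;  /* unordered, e.g. a NaN operand */
    }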
@@ -8241,8 +8241,10 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem,
unsigned HOST_WIDE_INT countval = UINTVAL (count);
unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
unsigned int destalign = MEM_ALIGN (destmem);
+ cfun->machine->by_pieces_in_use = true;
move_by_pieces (destmem, srcmem, epilogue_size, destalign,
RETURN_BEGIN);
+ cfun->machine->by_pieces_in_use = false;
return;
}
if (max_size > 8)
@@ -8405,8 +8407,8 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
/* Callback routine for store_by_pieces. Return the RTL of a register
containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
- is a word or a word vector register. If PREV_P isn't nullptr, it
- has the RTL info from the previous iteration. */
+ is an integer or a word vector register. If PREV_P isn't nullptr,
+ it has the RTL info from the previous iteration. */
static rtx
setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
@@ -8435,10 +8437,6 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
rtx op = (rtx) op_p;
machine_mode op_mode = GET_MODE (op);
- gcc_assert (op_mode == word_mode
- || (VECTOR_MODE_P (op_mode)
- && GET_MODE_INNER (op_mode) == word_mode));
-
if (VECTOR_MODE_P (mode))
{
gcc_assert (GET_MODE_INNER (mode) == QImode);
@@ -8460,16 +8458,17 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
return tmp;
}
- target = gen_reg_rtx (word_mode);
if (VECTOR_MODE_P (op_mode))
{
+ gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
+ target = gen_reg_rtx (word_mode);
op = gen_rtx_SUBREG (word_mode, op, 0);
emit_move_insn (target, op);
}
else
target = op;
- if (mode == word_mode)
+ if (mode == GET_MODE (target))
return target;
rtx tmp = gen_reg_rtx (mode);
@@ -8490,9 +8489,11 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
unsigned HOST_WIDE_INT countval = UINTVAL (count);
unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
unsigned int destalign = MEM_ALIGN (destmem);
+ cfun->machine->by_pieces_in_use = true;
store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
vec_value ? vec_value : value, destalign, true,
RETURN_BEGIN);
+ cfun->machine->by_pieces_in_use = false;
return;
}
if (max_size > 32)
@@ -9574,8 +9575,9 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
case vector_loop:
need_zero_guard = true;
unroll_factor = 4;
- /* Get the vector mode to move MOVE_MAX bytes. */
- nunits = MOVE_MAX / GET_MODE_SIZE (word_mode);
+ /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */
+ nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
+ nunits /= GET_MODE_SIZE (word_mode);
if (nunits > 1)
{
move_mode = mode_for_vector (word_mode, nunits).require ();
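Worked example under assumed values (not from the patch): with 8-byte word_mode and a 64-byte MOVE_MAX / STORE_MAX_PIECES (e.g. when 512-bit vectors are enabled), nunits = 64 / 8 = 8 and move_mode becomes V8DImode; with a 16-byte limit it would be 16 / 8 = 2 and V2DImode.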
@@ -27033,6 +27035,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
return target;
}
+/* GF2P8AFFINEQB matrices to implement shift and rotate. */
+
+static const uint64_t matrix_ashift[8] =
+{
+ 0,
+ 0x0001020408102040, /* 1 l */
+ 0x0000010204081020, /* 2 l */
+ 0x0000000102040810, /* 3 l */
+ 0x0000000001020408, /* 4 l */
+ 0x0000000000010204, /* 5 l */
+ 0x0000000000000102, /* 6 l */
+ 0x0000000000000001 /* 7 l */
+};
+
+static const uint64_t matrix_lshiftrt[8] =
+{
+ 0,
+ 0x0204081020408000, /* 1 r */
+ 0x0408102040800000, /* 2 r */
+ 0x0810204080000000, /* 3 r */
+ 0x1020408000000000, /* 4 r */
+ 0x2040800000000000, /* 5 r */
+ 0x4080000000000000, /* 6 r */
+ 0x8000000000000000 /* 7 r */
+};
+
+static const uint64_t matrix_ashiftrt[8] =
+{
+ 0,
+ 0x0204081020408080, /* 1 r */
+ 0x0408102040808080, /* 2 r */
+ 0x0810204080808080, /* 3 r */
+ 0x1020408080808080, /* 4 r */
+ 0x2040808080808080, /* 5 r */
+ 0x4080808080808080, /* 6 r */
+ 0x8080808080808080 /* 7 r */
+};
+
+static const uint64_t matrix_rotate[8] =
+{
+ 0,
+ 0x8001020408102040, /* 1 rol8 */
+ 0x4080010204081020, /* 2 rol8 */
+ 0x2040800102040810, /* 3 rol8 */
+ 0x1020408001020408, /* 4 rol8 */
+ 0x0810204080010204, /* 5 rol8 */
+ 0x0408102040800102, /* 6 rol8 */
+ 0x0204081020408001 /* 7 rol8 */
+};
+
+static const uint64_t matrix_rotatert[8] =
+{
+ 0,
+ 0x0204081020408001, /* 1 ror8 */
+ 0x0408102040800102, /* 2 ror8 */
+ 0x0810204080010204, /* 3 ror8 */
+ 0x1020408001020408, /* 4 ror8 */
+ 0x2040800102040810, /* 5 ror8 */
+ 0x4080010204081020, /* 6 ror8 */
+ 0x8001020408102040 /* 7 ror8 */
+};
+
+/* Return an rtx that loads a 64-bit GF2P8AFFINE GF(2) matrix implementing a
+ shift for CODE and shift count COUNT into a register with the same vector
+ mode as SRC. */
+
+rtx
+ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
+{
+ machine_mode mode = GET_MODE (src);
+ const uint64_t *matrix;
+ unsigned shift = INTVAL (count) & 7;
+ gcc_assert (shift > 0 && shift < 8);
+
+ switch (code)
+ {
+ case ASHIFT:
+ matrix = matrix_ashift;
+ break;
+ case ASHIFTRT:
+ matrix = matrix_ashiftrt;
+ break;
+ case LSHIFTRT:
+ matrix = matrix_lshiftrt;
+ break;
+ case ROTATE:
+ matrix = matrix_rotate;
+ break;
+ case ROTATERT:
+ matrix = matrix_rotatert;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ int nelts = GET_MODE_NUNITS (mode);
+ rtvec vec = rtvec_alloc (nelts);
+ uint64_t ma = matrix[shift];
+ for (int i = 0; i < nelts; i++)
+ RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
+
+ return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
+}
+
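Self-contained sketch (hypothetical helper names; V16QI-sized output assumed) of how the loop above replicates one 64-bit matrix across every 8-byte lane of the constant vector:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirror of the RTVEC expansion above: element i of the vector constant
       is byte (i % 8) of the 64-bit matrix, so the same 8x8 bit matrix is
       repeated once per 8-byte lane.  */
    static void
    expand_matrix (uint64_t matrix, uint8_t *vec, int nelts)
    {
      for (int i = 0; i < nelts; i++)
        vec[i] = (matrix >> ((i % 8) * 8)) & 0xff;
    }

    int
    main (void)
    {
      uint8_t v[16];
      expand_matrix (0x0001020408102040ULL, v, 16);  /* matrix_ashift[1] */
      for (int i = 0; i < 16; i++)
        printf ("%02x%c", v[i], i == 15 ? '\n' : ' ');
      return 0;
    }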
/* Trunc a vector to a narrow vector, like v4di -> v4si. */
void
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c131577..0608dd2 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3085,21 +3085,68 @@ ix86_rpad_gate ()
&& optimize_function_for_speed_p (cfun));
}
+enum x86_cse_kind
+{
+ X86_CSE_CONST0_VECTOR,
+ X86_CSE_CONSTM1_VECTOR,
+ X86_CSE_VEC_DUP,
+ X86_CSE_TLS_GD,
+ X86_CSE_TLS_LD_BASE,
+ X86_CSE_TLSDESC
+};
+
+struct redundant_pattern
+{
+ /* Bitmap of basic blocks with broadcast instructions. */
+ auto_bitmap bbs;
+ /* Bitmap of broadcast instructions. */
+ auto_bitmap insns;
+ /* The broadcast inner scalar. */
+ rtx val;
+ /* The actual redundant source value for UNSPEC_TLSDESC. */
+ rtx tlsdesc_val;
+ /* The inner scalar mode. */
+ machine_mode mode;
+ /* The instruction which sets the inner scalar. Nullptr if the inner
+ scalar is applied to the whole function, instead of within the same
+ block. */
+ rtx_insn *def_insn;
+ /* The widest broadcast source. */
+ rtx broadcast_source;
+ /* The widest broadcast register. */
+ rtx broadcast_reg;
+ /* The basic block of the broadcast instruction. */
+ basic_block bb;
+ /* The number of broadcast instructions with the same inner scalar. */
+ unsigned HOST_WIDE_INT count;
+ /* The threshold of broadcast instructions with the same inner
+ scalar. */
+ unsigned int threshold;
+ /* The widest broadcast size in bytes. */
+ unsigned int size;
+ /* Load kind. */
+ x86_cse_kind kind;
+};
+
/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
for basic block map BBS, which is in the fake loop that contains the
whole function, so that there is only a single vector set in the
- whole function. If not nullptr, INNER_SCALAR is the inner scalar of
- SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */
+ whole function. If not nullptr, LOAD is a pointer to the load. */
static void
ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
- rtx inner_scalar = nullptr)
+ redundant_pattern *load = nullptr)
{
basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
- while (bb->loop_father->latch
- != EXIT_BLOCK_PTR_FOR_FN (cfun))
- bb = get_immediate_dominator (CDI_DOMINATORS,
- bb->loop_father->header);
+ /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
+ to avoid extra spills. */
+ if (!load || load->kind != X86_CSE_VEC_DUP)
+ {
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+ }
rtx set = gen_rtx_SET (dest, src);
@@ -3141,8 +3188,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
}
}
- if (inner_scalar)
+ if (load && load->kind == X86_CSE_VEC_DUP)
{
+ /* Get the source from LOAD as (reg:SI 99) in
+
+ (vec_duplicate:V4SI (reg:SI 99))
+
+ */
+ rtx inner_scalar = load->val;
/* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
rtx reg = XEXP (src, 0);
if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
@@ -3226,7 +3279,7 @@ remove_partial_avx_dependency (void)
break;
}
- /* Only hanlde conversion here. */
+ /* Only handle conversion here. */
machine_mode src_mode
= convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
switch (src_mode)
@@ -3489,44 +3542,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
}
}
-enum x86_cse_kind
-{
- X86_CSE_CONST0_VECTOR,
- X86_CSE_CONSTM1_VECTOR,
- X86_CSE_VEC_DUP
-};
-
-struct redundant_load
-{
- /* Bitmap of basic blocks with broadcast instructions. */
- auto_bitmap bbs;
- /* Bitmap of broadcast instructions. */
- auto_bitmap insns;
- /* The broadcast inner scalar. */
- rtx val;
- /* The inner scalar mode. */
- machine_mode mode;
- /* The instruction which sets the inner scalar. Nullptr if the inner
- scalar is applied to the whole function, instead of within the same
- block. */
- rtx_insn *def_insn;
- /* The widest broadcast source. */
- rtx broadcast_source;
- /* The widest broadcast register. */
- rtx broadcast_reg;
- /* The basic block of the broadcast instruction. */
- basic_block bb;
- /* The number of broadcast instructions with the same inner scalar. */
- unsigned HOST_WIDE_INT count;
- /* The threshold of broadcast instructions with the same inner
- scalar. */
- unsigned int threshold;
- /* The widest broadcast size in bytes. */
- unsigned int size;
- /* Load kind. */
- x86_cse_kind kind;
-};
-
/* Return the inner scalar if OP is a broadcast, else return nullptr. */
static rtx
@@ -3629,6 +3644,8 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
integer constant. */
op = src;
+ if (mode != GET_MODE (reg))
+ op = gen_int_mode (INTVAL (src), mode);
*insn_p = nullptr;
}
else
@@ -3669,25 +3686,719 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
return op;
}
-/* At entry of the nearest common dominator for basic blocks with vector
- CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
- vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
- uses.
+/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and
+ put the updated instruction in UPDATED_TLS_INSNS. */
- NB: We want to generate only a single widest vector set to cover the
- whole function. The LCM algorithm isn't appropriate here since it
- may place a vector set inside the loop. */
+static void
+replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
+ auto_bitmap &updated_tls_insns)
+{
+ bitmap_iterator bi;
+ unsigned int id;
-static unsigned int
-remove_redundant_vector_load (void)
+ EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
+ {
+ rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
+
+ /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
+ allowed. */
+ if (!CALL_P (insn))
+ {
+ attr_tls64 tls64 = get_attr_tls64 (insn);
+ if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
+ gcc_unreachable ();
+ }
+
+ rtx pat = PATTERN (insn);
+ gcc_assert (GET_CODE (pat) == PARALLEL);
+ rtx set = XVECEXP (pat, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx dest = SET_DEST (set);
+
+ set = gen_rtx_SET (dest, src);
+ rtx_insn *set_insn = emit_insn_after (set, insn);
+ if (recog_memoized (set_insn) < 0)
+ gcc_unreachable ();
+
+ /* Put SET_INSN in UPDATED_TLS_INSNS. */
+ bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nReplace:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nwith:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\n");
+ }
+
+ /* Delete the CALL insn. */
+ delete_insn (insn);
+
+ df_insn_rescan (set_insn);
+ }
+}
+
+/* Return the basic block which dominates all basic blocks which set
+ hard register REGNO used in basic block BB. */
+
+static basic_block
+ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
+{
+ basic_block set_bb;
+ auto_bitmap set_bbs;
+
+ /* Get all BBs which set REGNO and dominate the current BB from all
+ DEFs of REGNO. */
+ for (df_ref def = DF_REG_DEF_CHAIN (regno);
+ def;
+ def = DF_REF_NEXT_REG (def))
+ if (!DF_REF_IS_ARTIFICIAL (def)
+ && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
+ && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
+ {
+ set_bb = DF_REF_BB (def);
+ if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
+ bitmap_set_bit (set_bbs, set_bb->index);
+ }
+
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
+ return bb;
+}
+
+/* Mark FLAGS register as live in DATA, a bitmap of live caller-saved
+ registers, if DEST is FLAGS register. */
+
+static void
+ix86_check_flags_reg (rtx dest, const_rtx, void *data)
+{
+ auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
+ if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
+ bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
+}
+
+/* Emit a TLS_SET instruction of KIND in basic block BB. Store the
+ insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P
+ for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions
+ which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS
+ contains instructions which replace the GNU2 TLS instructions. */
+
+static rtx_insn *
+ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
+ rtx_insn **before_p, rtx_insn **after_p,
+ auto_bitmap &updated_gnu_tls_insns,
+ auto_bitmap &updated_gnu2_tls_insns)
+{
+ rtx_insn *tls_insn;
+
+ do
+ {
+ rtx_insn *insn = BB_HEAD (bb);
+ while (insn && !NONDEBUG_INSN_P (insn))
+ {
+ if (insn == BB_END (bb))
+ {
+ /* This must be the beginning basic block:
+
+ (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
+
+ or a basic block with only a label:
+
+ (code_label 78 11 77 3 14 (nil) [1 uses])
+ (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
+
+ or a basic block with only a debug marker:
+
+ (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
+ (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+
+ */
+ gcc_assert (DEBUG_INSN_P (insn)
+ || (NOTE_P (insn)
+ && ((NOTE_KIND (insn)
+ == NOTE_INSN_FUNCTION_BEG)
+ || (NOTE_KIND (insn)
+ == NOTE_INSN_BASIC_BLOCK))));
+ insn = NULL;
+ break;
+ }
+ insn = NEXT_INSN (insn);
+ }
+
+ /* TLS_GD and TLS_LD_BASE instructions are normal functions which
+ clobber caller-saved registers. TLSDESC instructions only
+ clobber FLAGS. If any registers clobbered by TLS instructions
+ are live in this basic block, we must insert TLS instructions
+ after all live registers clobbered are dead. */
+
+ auto_bitmap live_caller_saved_regs;
+ bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);
+
+ if (bitmap_bit_p (in, FLAGS_REG))
+ bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);
+
+ unsigned int i;
+
+ /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
+ instructions. */
+ if (kind != X86_CSE_TLSDESC)
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (call_used_regs[i]
+ && !fixed_regs[i]
+ && bitmap_bit_p (in, i))
+ bitmap_set_bit (live_caller_saved_regs, i);
+
+ if (bitmap_empty_p (live_caller_saved_regs))
+ {
+ if (insn == BB_HEAD (bb))
+ {
+ *before_p = insn;
+ tls_insn = emit_insn_before (tls_set, insn);
+ }
+ else
+ {
+ /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
+ beginning basic block:
+
+ (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
+
+ or after NOTE_INSN_BASIC_BLOCK in a basic block with
+ only a label:
+
+ (code_label 78 11 77 3 14 (nil) [1 uses])
+ (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
+
+ or after debug marker in a basic block with only a
+ debug marker:
+
+ (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
+ (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+
+ */
+ insn = insn ? PREV_INSN (insn) : BB_END (bb);
+ *after_p = insn;
+ tls_insn = emit_insn_after (tls_set, insn);
+ }
+ return tls_insn;
+ }
+
+ bool repeat = false;
+
+ /* Search for REG_DEAD notes in this basic block. */
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+ /* NB: A conditional jump is the only instruction which reads the
+ flags register and changes control flow. We can never place
+ the TLS call after an unconditional jump. */
+ if (JUMP_P (insn))
+ {
+ /* This must be a conditional jump. */
+ rtx label = JUMP_LABEL (insn);
+ if (label == nullptr
+ || ANY_RETURN_P (label)
+ || !(LABEL_P (label) || SYMBOL_REF_P (label)))
+ gcc_unreachable ();
+
+ /* Place the call before all FLAGS_REG-setting BBs since we can't
+ place a call either before or after a conditional jump. */
+ bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);
+
+ /* Start over again. */
+ repeat = true;
+ break;
+ }
+
+ if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
+ {
+ /* Insert the __tls_get_addr call before INSN which
+ replaces a __tls_get_addr call. */
+ *before_p = insn;
+ tls_insn = emit_insn_before (tls_set, insn);
+ return tls_insn;
+ }
+
+ if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
+ {
+ /* Mark FLAGS register as dead since FLAGS register
+ would be clobbered by the GNU2 TLS instruction. */
+ bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
+ continue;
+ }
+
+ /* Check if FLAGS register is live. */
+ note_stores (insn, ix86_check_flags_reg,
+ &live_caller_saved_regs);
+
+ rtx link;
+ for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
+ if (REG_NOTE_KIND (link) == REG_DEAD
+ && REG_P (XEXP (link, 0)))
+ {
+ /* Mark the live caller-saved register as dead. */
+ for (i = REGNO (XEXP (link, 0));
+ i < END_REGNO (XEXP (link, 0));
+ i++)
+ if (i < FIRST_PSEUDO_REGISTER)
+ bitmap_clear_bit (live_caller_saved_regs, i);
+
+ if (bitmap_empty_p (live_caller_saved_regs))
+ {
+ *after_p = insn;
+ tls_insn = emit_insn_after (tls_set, insn);
+ return tls_insn;
+ }
+ }
+ }
+
+ /* NB: Start over again for conditional jump. */
+ if (repeat)
+ continue;
+
+ gcc_assert (!bitmap_empty_p (live_caller_saved_regs));
+
+ /* If any live caller-saved registers aren't dead at the end of
+ this basic block, get the basic block which dominates all
+ basic blocks which set the remaining live registers. */
+ auto_bitmap set_bbs;
+ bitmap_iterator bi;
+ unsigned int id;
+ EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
+ {
+ basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
+ bitmap_set_bit (set_bbs, set_bb->index);
+ }
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
+ }
+ while (true);
+}
+
+/* Generate a TLS call of KIND with VAL and copy the call result to DEST,
+ at entry of the nearest dominator for basic block map BBS, which is in
+ the fake loop that contains the whole function, so that there is only
+ a single TLS CALL of KIND with VAL in the whole function.
+ UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
+ instructions. UPDATED_GNU2_TLS_INSNS contains instructions which
+ replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr,
+ insert it before the TLS call. */
+
+static void
+ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
+ auto_bitmap &bbs,
+ auto_bitmap &updated_gnu_tls_insns,
+ auto_bitmap &updated_gnu2_tls_insns,
+ rtx tlsdesc_set = nullptr)
+{
+ basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+
+ rtx rax = nullptr, rdi;
+ rtx eqv = nullptr;
+ rtx caddr;
+ rtx set;
+ rtx clob;
+ rtx symbol;
+ rtx tls;
+
+ switch (kind)
+ {
+ case X86_CSE_TLS_GD:
+ rax = gen_rtx_REG (Pmode, AX_REG);
+ rdi = gen_rtx_REG (Pmode, DI_REG);
+ caddr = ix86_tls_get_addr ();
+
+ symbol = XVECEXP (val, 0, 0);
+ tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);
+
+ if (GET_MODE (symbol) != Pmode)
+ symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
+ eqv = symbol;
+ break;
+
+ case X86_CSE_TLS_LD_BASE:
+ rax = gen_rtx_REG (Pmode, AX_REG);
+ rdi = gen_rtx_REG (Pmode, DI_REG);
+ caddr = ix86_tls_get_addr ();
+
+ tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);
+
+ /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
+ to share the LD_BASE result with other LD model accesses. */
+ eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+ UNSPEC_TLS_LD_BASE);
+
+ break;
+
+ case X86_CSE_TLSDESC:
+ set = gen_rtx_SET (dest, val);
+ clob = gen_rtx_CLOBBER (VOIDmode,
+ gen_rtx_REG (CCmode, FLAGS_REG));
+ tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Emit the TLS CALL insn. */
+ rtx_insn *before = nullptr;
+ rtx_insn *after = nullptr;
+ rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
+ &after,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns);
+
+ rtx_insn *tlsdesc_insn = nullptr;
+ if (tlsdesc_set)
+ {
+ rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
+ rtx src = copy_rtx (SET_SRC (tlsdesc_set));
+ tlsdesc_set = gen_rtx_SET (dest, src);
+ tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
+ }
+
+ if (kind != X86_CSE_TLSDESC)
+ {
+ RTL_CONST_CALL_P (tls_insn) = 1;
+
+ /* Indicate that this function can't jump to non-local gotos. */
+ make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
+ }
+
+ if (recog_memoized (tls_insn) < 0)
+ gcc_unreachable ();
+
+ if (dump_file)
+ {
+ if (after)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ if (tlsdesc_insn)
+ print_rtl_single (dump_file, tlsdesc_insn);
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, after);
+ fprintf (dump_file, "\n");
+ }
+ else
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ if (tlsdesc_insn)
+ print_rtl_single (dump_file, tlsdesc_insn);
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\nbefore:\n\n");
+ print_rtl_single (dump_file, before);
+ fprintf (dump_file, "\n");
+ }
+ }
+
+ if (kind != X86_CSE_TLSDESC)
+ {
+ /* Copy RAX to DEST. */
+ set = gen_rtx_SET (dest, rax);
+ rtx_insn *set_insn = emit_insn_after (set, tls_insn);
+ set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\n");
+ }
+ }
+}
+
+namespace {
+
+const pass_data pass_data_x86_cse =
+{
+ RTL_PASS, /* type */
+ "x86_cse", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_MACH_DEP, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_x86_cse : public rtl_opt_pass
+{
+public:
+ pass_x86_cse (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_x86_cse, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate (function *fun) final override
+ {
+ return (TARGET_SSE2
+ && optimize
+ && optimize_function_for_speed_p (fun));
+ }
+
+ unsigned int execute (function *) final override
+ {
+ return x86_cse ();
+ }
+
+private:
+ /* The redundant source value. */
+ rtx val;
+ /* The actual redundant source value for UNSPEC_TLSDESC. */
+ rtx tlsdesc_val;
+ /* The instruction which defines the redundant value. */
+ rtx_insn *def_insn;
+ /* Mode of the destination of the candidate redundant instruction. */
+ machine_mode mode;
+ /* Mode of the source of the candidate redundant instruction. */
+ machine_mode scalar_mode;
+ /* The classification of the candidate redundant instruction. */
+ x86_cse_kind kind;
+
+ unsigned int x86_cse (void);
+ bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
+ bool candidate_gnu2_tls_p (rtx, attr_tls64);
+ bool candidate_vector_p (rtx);
+ rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
+}; // class pass_x86_cse
+
+/* Return the instruction which sets REG from TLS_SYMBOL. */
+
+rtx_insn *
+pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
+ const_rtx tls_symbol)
+{
+ rtx_insn *set_insn = nullptr;
+ for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
+ ref;
+ ref = DF_REF_NEXT_REG (ref))
+ {
+ if (DF_REF_IS_ARTIFICIAL (ref))
+ return nullptr;
+
+ set_insn = DF_REF_INSN (ref);
+ if (get_attr_tls64 (set_insn) != TLS64_LEA)
+ return nullptr;
+
+ rtx tls_set = PATTERN (set_insn);
+ rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
+ if (!rtx_equal_p (tls_symbol, tls_src))
+ return nullptr;
+ }
+
+ return set_insn;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. */
+
+bool
+pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
+{
+ if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
+ return false;
+
+ /* Record the redundant TLS CALLs for 64-bit:
+
+ (parallel [
+ (set (reg:DI 0 ax)
+ (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
+ (const_int 0 [0])))
+ (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
+ (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
+ (clobber (reg:DI 5 di))])
+
+
+ and
+
+ (parallel [
+ (set (reg:DI 0 ax)
+ (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
+ (const_int 0 [0])))
+ (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
+
+ */
+
+ rtx pat = PATTERN (insn);
+ rtx set = XVECEXP (pat, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx dest = SET_DEST (set);
+ scalar_mode = mode = GET_MODE (dest);
+ val = XVECEXP (pat, 0, 1);
+ gcc_assert (GET_CODE (val) == UNSPEC);
+
+ if (tls64 == TLS64_GD)
+ kind = X86_CSE_TLS_GD;
+ else
+ kind = X86_CSE_TLS_LD_BASE;
+
+ def_insn = nullptr;
+ return true;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ SET is UNSPEC_TLSDESC. */
+
+bool
+pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
+{
+ if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
+ return false;
+
+ rtx tls_symbol;
+ rtx_insn *set_insn;
+ rtx src = SET_SRC (set);
+ val = src;
+ tlsdesc_val = src;
+ kind = X86_CSE_TLSDESC;
+
+ if (tls64 == TLS64_COMBINE)
+ {
+ /* Record 64-bit TLS64_COMBINE:
+
+ (set (reg/f:DI 104)
+ (plus:DI (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ (reg:DI 114)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))))
+
+ (set (reg/f:DI 104)
+ (plus:DI (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ ] UNSPEC_TLSDESC)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))))
+ */
+
+ scalar_mode = mode = GET_MODE (src);
+
+ /* Since the first operand of PLUS in the source TLS_COMBINE
+ pattern is unused, use the second operand of PLUS:
+
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))
+
+ as VAL to check if 2 TLS_COMBINE patterns have the same
+ source. */
+ val = XEXP (src, 1);
+ gcc_assert (GET_CODE (val) == CONST
+ && GET_CODE (XEXP (val, 0)) == UNSPEC
+ && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
+ && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
+ def_insn = nullptr;
+ return true;
+ }
+
+ /* Record 64-bit TLS_CALL:
+
+ (set (reg:DI 101)
+ (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
+ (reg:DI 112)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
+
+ */
+
+ gcc_assert (GET_CODE (src) == UNSPEC);
+ tls_symbol = XVECEXP (src, 0, 0);
+ src = XVECEXP (src, 0, 1);
+ scalar_mode = mode = GET_MODE (src);
+ gcc_assert (REG_P (src));
+
+ /* All definitions of reg:DI 129 in
+
+ (set (reg:DI 110)
+ (unspec:DI [(symbol_ref:DI ("foo"))
+ (reg:DI 129)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
+
+ should have the same source as in
+
+ (set (reg:DI 129)
+ (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))
+
+ */
+
+ set_insn = tls_set_insn_from_symbol (src, tls_symbol);
+ if (!set_insn)
+ return false;
+
+ /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */
+ val = tls_symbol;
+ def_insn = set_insn;
+ return true;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ SET is a vector broadcast pattern. */
+
+bool
+pass_x86_cse::candidate_vector_p (rtx set)
+{
+ rtx src = SET_SRC (set);
+ rtx dest = SET_DEST (set);
+ mode = GET_MODE (dest);
+ /* Skip non-vector instruction. */
+ if (!VECTOR_MODE_P (mode))
+ return false;
+
+ /* Skip non-vector load instruction. */
+ if (!REG_P (dest) && !SUBREG_P (dest))
+ return false;
+
+ val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
+ &def_insn);
+ return val ? true : false;
+}
+
+/* At entry of the nearest common dominator for basic blocks with
+
+ 1. Vector CONST0_RTX patterns.
+ 2. Vector CONSTM1_RTX patterns.
+ 3. Vector broadcast patterns.
+ 4. UNSPEC_TLS_GD patterns.
+ 5. UNSPEC_TLS_LD_BASE patterns.
+ 6. UNSPEC_TLSDESC patterns.
+
+ generate a single pattern whose destination is used to replace the
+ source in all identical patterns.
+
+ NB: We want to generate a pattern, which is executed only once, to
+ cover the whole function. The LCM algorithm isn't appropriate here
+ since it may place a pattern inside the loop. */
+
+unsigned int
+pass_x86_cse::x86_cse (void)
{
timevar_push (TV_MACH_DEP);
- auto_vec<redundant_load *> loads;
- redundant_load *load;
+ auto_vec<redundant_pattern *> loads;
+ redundant_pattern *load;
basic_block bb;
rtx_insn *insn;
unsigned int i;
+ auto_bitmap updated_gnu_tls_insns;
+ auto_bitmap updated_gnu2_tls_insns;
df_set_flags (DF_DEFER_INSN_RESCAN);
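Source-level intuition for the TLS part of this pass (hypothetical example; whether each access really expands to its own call depends on the TLS model and earlier optimizations):

    /* Compiled with -fPIC under the general-dynamic TLS model, each access of
       COUNTER may expand to a separate __tls_get_addr (or TLS-descriptor)
       call; after this pass a single call remains at a dominating point and
       its result is copied to the other uses.  */
    extern __thread int counter;

    int
    bump_twice (void)
    {
      counter++;
      counter++;
      return counter;
    }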
@@ -3700,61 +4411,74 @@ remove_redundant_vector_load (void)
if (!NONDEBUG_INSN_P (insn))
continue;
+ bool matched = false;
+ /* Remove redundant patterns if there are more than 2 of
+ them. */
+ unsigned int threshold = 2;
+
rtx set = single_set (insn);
- if (!set)
+ if (!set && !CALL_P (insn))
continue;
- /* Record single set vector instruction with CONST0_RTX and
- CONSTM1_RTX source. Record basic blocks with CONST0_RTX and
- CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the
- maximum size of CONST0_RTX and CONSTM1_RTX. */
+ tlsdesc_val = nullptr;
- rtx dest = SET_DEST (set);
- machine_mode mode = GET_MODE (dest);
- /* Skip non-vector instruction. */
- if (!VECTOR_MODE_P (mode))
- continue;
+ attr_tls64 tls64 = get_attr_tls64 (insn);
+ switch (tls64)
+ {
+ case TLS64_GD:
+ case TLS64_LD_BASE:
+ /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */
+ if (candidate_gnu_tls_p (insn, tls64))
+ break;
+ continue;
- rtx src = SET_SRC (set);
- /* Skip non-vector load instruction. */
- if (!REG_P (dest) && !SUBREG_P (dest))
- continue;
+ case TLS64_CALL:
+ case TLS64_COMBINE:
+ /* Verify UNSPEC_TLSDESC. */
+ if (candidate_gnu2_tls_p (set, tls64))
+ break;
+ continue;
- rtx_insn *def_insn;
- machine_mode scalar_mode;
- x86_cse_kind kind;
- rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
- &kind, &def_insn);
- if (!val)
- continue;
+ case TLS64_LEA:
+ /* Skip TLS64_LEA. */
+ continue;
- /* Remove redundant register loads if there are more than 2
- loads will be used. */
- unsigned int threshold = 2;
+ case TLS64_NONE:
+ if (!set)
+ continue;
- /* Check if there is a matching redundant vector load. */
- bool matched = false;
+ /* Check for vector broadcast. */
+ if (candidate_vector_p (set))
+ break;
+ continue;
+ }
+
+ /* Check if there is a matching redundant load. */
FOR_EACH_VEC_ELT (loads, i, load)
if (load->val
&& load->kind == kind
&& load->mode == scalar_mode
&& (load->bb == bb
- || kind < X86_CSE_VEC_DUP
+ || kind != X86_CSE_VEC_DUP
/* Non all 0s/1s vector load must be in the same
basic block if it is in a recursive call. */
|| !recursive_call_p)
&& rtx_equal_p (load->val, val))
{
- /* Record vector instruction. */
+ /* Record instruction. */
bitmap_set_bit (load->insns, INSN_UID (insn));
/* Record the maximum vector size. */
- if (load->size < GET_MODE_SIZE (mode))
+ if (kind <= X86_CSE_VEC_DUP
+ && load->size < GET_MODE_SIZE (mode))
load->size = GET_MODE_SIZE (mode);
/* Record the basic block. */
bitmap_set_bit (load->bbs, bb->index);
+
+ /* Increment the count. */
load->count++;
+
matched = true;
break;
}
@@ -3762,10 +4486,17 @@ remove_redundant_vector_load (void)
if (matched)
continue;
- /* We see this vector broadcast the first time. */
- load = new redundant_load;
+	  /* We see this instruction for the first time.  Record the
+	     redundant source value, its mode, the destination size,
+	     the instruction which defines the redundant source value,
+	     its basic block and the instruction kind.  */
+ load = new redundant_pattern;
load->val = copy_rtx (val);
+ if (tlsdesc_val)
+ load->tlsdesc_val = copy_rtx (tlsdesc_val);
+ else
+ load->tlsdesc_val = nullptr;
load->mode = scalar_mode;
load->size = GET_MODE_SIZE (mode);
load->def_insn = def_insn;
@@ -3782,49 +4513,64 @@ remove_redundant_vector_load (void)
}
bool replaced = false;
- rtx reg, broadcast_source, broadcast_reg;
FOR_EACH_VEC_ELT (loads, i, load)
if (load->count >= load->threshold)
{
- machine_mode mode = ix86_get_vector_cse_mode (load->size,
- load->mode);
- broadcast_reg = gen_reg_rtx (mode);
- if (load->def_insn)
- {
- /* Replace redundant vector loads with a single vector load
- in the same basic block. */
- reg = load->val;
- if (load->mode != GET_MODE (reg))
- reg = gen_rtx_SUBREG (load->mode, reg, 0);
- broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
- replace_vector_const (mode, broadcast_reg, load->insns,
- load->mode);
- }
- else
+ machine_mode mode;
+ rtx reg, broadcast_source, broadcast_reg;
+ replaced = true;
+ switch (load->kind)
{
- /* This is a constant integer/double vector. If the
- inner scalar is 0 or -1, set vector to CONST0_RTX
- or CONSTM1_RTX directly. */
- rtx reg;
- switch (load->kind)
+ case X86_CSE_TLS_GD:
+ case X86_CSE_TLS_LD_BASE:
+ case X86_CSE_TLSDESC:
+ broadcast_reg = gen_reg_rtx (load->mode);
+ replace_tls_call (broadcast_reg, load->insns,
+ (load->kind == X86_CSE_TLSDESC
+ ? updated_gnu2_tls_insns
+ : updated_gnu_tls_insns));
+ load->broadcast_reg = broadcast_reg;
+ break;
+
+ case X86_CSE_CONST0_VECTOR:
+ case X86_CSE_CONSTM1_VECTOR:
+ case X86_CSE_VEC_DUP:
+ mode = ix86_get_vector_cse_mode (load->size, load->mode);
+ broadcast_reg = gen_reg_rtx (mode);
+ if (load->def_insn)
{
- case X86_CSE_CONST0_VECTOR:
- broadcast_source = CONST0_RTX (mode);
- break;
- case X86_CSE_CONSTM1_VECTOR:
- broadcast_source = CONSTM1_RTX (mode);
- break;
- default:
- reg = gen_reg_rtx (load->mode);
+ /* Replace redundant vector loads with a single vector
+ load in the same basic block. */
+ reg = load->val;
+ if (load->mode != GET_MODE (reg))
+ reg = gen_rtx_SUBREG (load->mode, reg, 0);
broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
- break;
}
+ else
+ /* This is a constant integer/double vector. If the
+ inner scalar is 0 or -1, set vector to CONST0_RTX
+ or CONSTM1_RTX directly. */
+ switch (load->kind)
+ {
+ case X86_CSE_CONST0_VECTOR:
+ broadcast_source = CONST0_RTX (mode);
+ break;
+ case X86_CSE_CONSTM1_VECTOR:
+ broadcast_source = CONSTM1_RTX (mode);
+ break;
+ case X86_CSE_VEC_DUP:
+ reg = gen_reg_rtx (load->mode);
+ broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+ break;
+ default:
+ gcc_unreachable ();
+ }
replace_vector_const (mode, broadcast_reg, load->insns,
load->mode);
+ load->broadcast_source = broadcast_source;
+ load->broadcast_reg = broadcast_reg;
+ break;
}
- load->broadcast_source = broadcast_source;
- load->broadcast_reg = broadcast_reg;
- replaced = true;
}
if (replaced)
@@ -3839,43 +4585,75 @@ remove_redundant_vector_load (void)
FOR_EACH_VEC_ELT (loads, i, load)
if (load->count >= load->threshold)
{
+ rtx set;
if (load->def_insn)
- {
- /* Insert a broadcast after the original scalar
- definition. */
- rtx set = gen_rtx_SET (load->broadcast_reg,
- load->broadcast_source);
- insn = emit_insn_after (set, load->def_insn);
-
- if (cfun->can_throw_non_call_exceptions)
- {
- /* Handle REG_EH_REGION note in DEF_INSN. */
- rtx note = find_reg_note (load->def_insn,
- REG_EH_REGION, nullptr);
- if (note)
- {
- control_flow_insns.safe_push (load->def_insn);
- add_reg_note (insn, REG_EH_REGION,
- XEXP (note, 0));
- }
- }
+ switch (load->kind)
+ {
+ case X86_CSE_TLSDESC:
+ ix86_place_single_tls_call (load->broadcast_reg,
+ load->tlsdesc_val,
+ load->kind,
+ load->bbs,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns,
+ PATTERN (load->def_insn));
+ break;
+ case X86_CSE_VEC_DUP:
+ /* Insert a broadcast after the original scalar
+ definition. */
+ set = gen_rtx_SET (load->broadcast_reg,
+ load->broadcast_source);
+ insn = emit_insn_after (set, load->def_insn);
+
+ if (cfun->can_throw_non_call_exceptions)
+ {
+ /* Handle REG_EH_REGION note in DEF_INSN. */
+ rtx note = find_reg_note (load->def_insn,
+ REG_EH_REGION, nullptr);
+ if (note)
+ {
+ control_flow_insns.safe_push (load->def_insn);
+ add_reg_note (insn, REG_EH_REGION,
+ XEXP (note, 0));
+ }
+ }
- if (dump_file)
- {
- fprintf (dump_file, "\nAdd:\n\n");
- print_rtl_single (dump_file, insn);
- fprintf (dump_file, "\nafter:\n\n");
- print_rtl_single (dump_file, load->def_insn);
- fprintf (dump_file, "\n");
- }
- }
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nAdd:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, load->def_insn);
+ fprintf (dump_file, "\n");
+ }
+ break;
+ default:
+ gcc_unreachable ();
+ }
else
- ix86_place_single_vector_set (load->broadcast_reg,
- load->broadcast_source,
- load->bbs,
- (load->kind == X86_CSE_VEC_DUP
- ? load->val
- : nullptr));
+ switch (load->kind)
+ {
+ case X86_CSE_TLS_GD:
+ case X86_CSE_TLS_LD_BASE:
+ case X86_CSE_TLSDESC:
+ ix86_place_single_tls_call (load->broadcast_reg,
+ (load->kind == X86_CSE_TLSDESC
+ ? load->tlsdesc_val
+ : load->val),
+ load->kind,
+ load->bbs,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns);
+ break;
+ case X86_CSE_CONST0_VECTOR:
+ case X86_CSE_CONSTM1_VECTOR:
+ case X86_CSE_VEC_DUP:
+ ix86_place_single_vector_set (load->broadcast_reg,
+ load->broadcast_source,
+ load->bbs,
+ load);
+ break;
+ }
}
loop_optimizer_finalize ();
@@ -3905,48 +4683,12 @@ remove_redundant_vector_load (void)
return 0;
}
-namespace {
-
-const pass_data pass_data_remove_redundant_vector_load =
-{
- RTL_PASS, /* type */
- "rrvl", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_MACH_DEP, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- 0, /* todo_flags_finish */
-};
-
-class pass_remove_redundant_vector_load : public rtl_opt_pass
-{
-public:
- pass_remove_redundant_vector_load (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
- {}
-
- /* opt_pass methods: */
- bool gate (function *fun) final override
- {
- return (TARGET_SSE2
- && optimize
- && optimize_function_for_speed_p (fun));
- }
-
- unsigned int execute (function *) final override
- {
- return remove_redundant_vector_load ();
- }
-}; // class pass_remove_redundant_vector_load
-
} // anon namespace
rtl_opt_pass *
-make_pass_remove_redundant_vector_load (gcc::context *ctxt)
+make_pass_x86_cse (gcc::context *ctxt)
{
- return new pass_remove_redundant_vector_load (ctxt);
+ return new pass_x86_cse (ctxt);
}
/* Convert legacy instructions that clobbers EFLAGS to APX_NF
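
The placement strategy described in the x86_cse pass comment above amounts to
folding nearest_common_dominator over every basic block that contains one of
the identical patterns.  Below is a minimal sketch of that step only, assuming
GCC's dominance API and a bitmap of block indices; the actual helpers
(ix86_place_single_vector_set and ix86_place_single_tls_call) additionally
avoid placing the definition inside a loop and handle the TLS bookkeeping.

/* Sketch only, not part of the patch: pick the basic block whose entry
   dominates every block recorded in BBS, so a single hoisted definition
   can reach all identical patterns.  Assumes dominance info has been
   computed with calculate_dominance_info (CDI_DOMINATORS).  */
static basic_block
nearest_dominating_block (bitmap bbs)
{
  basic_block insert_bb = nullptr;
  bitmap_iterator bi;
  unsigned int i;

  EXECUTE_IF_SET_IN_BITMAP (bbs, 0, i, bi)
    {
      basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i);
      insert_bb = (insert_bb
		   ? nearest_common_dominator (CDI_DOMINATORS, insert_bb, bb)
		   : bb);
    }
  return insert_bb;
}
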
diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index 2fedbeb..c2db305 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -91,7 +91,6 @@ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */
VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */
VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */
-VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */
VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
VECTOR_MODE (FLOAT, BF, 2); /* V2BF */
VECTOR_MODE (FLOAT, HF, 6); /* V6HF */
@@ -102,7 +101,6 @@ VECTOR_MODE (INT, QI, 2); /* V2QI */
VECTOR_MODE (INT, QI, 12); /* V12QI */
VECTOR_MODE (INT, QI, 14); /* V14QI */
VECTOR_MODE (INT, HI, 6); /* V6HI */
-VECTOR_MODE (INT, SI, 64); /* V64SI */
INT_MODE (OI, 32);
INT_MODE (XI, 64);
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index ca6bb83..abb5dd7 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1172,6 +1172,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
OPT_mrecip,
MASK_RECIP),
+ IX86_ATTR_YES ("80387",
+ OPT_m80387,
+ MASK_80387),
+
IX86_ATTR_IX86_YES ("general-regs-only",
OPT_mgeneral_regs_only,
OPTION_MASK_GENERAL_REGS_ONLY),
@@ -1281,6 +1285,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
else if (type == ix86_opt_yes || type == ix86_opt_no)
{
+ opts_set->x_target_flags |= mask;
+
if (type == ix86_opt_no)
opt_set_p = !opt_set_p;
@@ -3556,6 +3562,10 @@ ix86_set_current_function (tree fndecl)
isa = "AVX";
else if (cfun->machine->func_type != TYPE_NORMAL)
isa = "SSE";
+ else if (TARGET_MMX)
+ isa = "MMX/3Dnow";
+ else if (TARGET_80387)
+ isa = "80387";
else
isa = NULL;
}
@@ -3615,6 +3625,18 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
+ if (TARGET_64BIT)
+ {
+ /* Do not warn when emulating the MS ABI. */
+ if ((TREE_CODE (*node) != FUNCTION_TYPE
+ && TREE_CODE (*node) != METHOD_TYPE)
+ || ix86_function_type_abi (*node) != MS_ABI)
+ warning (OPT_Wattributes, "%qE attribute ignored",
+ name);
+ *no_add_attrs = true;
+ return NULL_TREE;
+ }
+
/* Can combine regparm with all attributes but fastcall, and thiscall. */
if (is_attribute_p ("regparm", name))
{
@@ -3627,7 +3649,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
{
- error ("regparam and thiscall attributes are not compatible");
+ error ("regparm and thiscall attributes are not compatible");
}
cst = TREE_VALUE (args);
@@ -3648,19 +3670,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
- if (TARGET_64BIT)
- {
- /* Do not warn when emulating the MS ABI. */
- if ((TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE)
- || ix86_function_type_abi (*node) != MS_ABI)
- warning (OPT_Wattributes, "%qE attribute ignored",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
-
- /* Can combine fastcall with stdcall (redundant) and sseregparm. */
+ /* Can combine fastcall with sseregparm. */
if (is_attribute_p ("fastcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3681,8 +3691,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
}
}
- /* Can combine stdcall with fastcall (redundant), regparm and
- sseregparm. */
+ /* Can combine stdcall with regparm and sseregparm. */
else if (is_attribute_p ("stdcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3732,6 +3741,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
{
error ("cdecl and thiscall attributes are not compatible");
}
+ if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("regparm and thiscall attributes are not compatible");
+ }
}
/* Can combine sseregparm with all attributes. */
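
Two user-visible effects of the i386-options.cc changes above, shown with
hypothetical snippets (not taken from the patch's testsuite; a 32-bit target
is assumed for the calling-convention attributes):

/* The regparm/thiscall conflict is now also diagnosed when thiscall is
   the attribute being processed:
   error: regparm and thiscall attributes are not compatible.  */
void __attribute__ ((regparm (2), thiscall)) f (int);

/* "80387" becomes a per-function target attribute mirroring -m80387,
   assuming the usual yes/no attribute naming ("80387" / "no-80387").  */
double __attribute__ ((target ("no-80387"))) g (double x);
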
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 06f0288..553b46d 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -35,6 +35,6 @@ along with GCC; see the file COPYING3. If not see
PR116174. */
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
- INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_x86_cse);
INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 69bc0ee..bdb8bb9 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -290,6 +290,7 @@ extern rtx ix86_tls_module_base (void);
extern bool ix86_gpr_tls_address_pattern_p (rtx);
extern bool ix86_tls_address_pattern_p (rtx);
extern rtx ix86_rewrite_tls_address (rtx);
+extern rtx ix86_tls_get_addr (void);
extern void ix86_expand_vector_init (bool, rtx, rtx);
extern void ix86_expand_vector_set (bool, rtx, rtx, int);
@@ -430,8 +431,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
(gcc::context *);
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
-extern rtl_opt_pass *make_pass_remove_redundant_vector_load
- (gcc::context *);
+extern rtl_opt_pass *make_pass_x86_cse (gcc::context *);
extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
@@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap);
/* In i386-expand.cc. */
bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
HOST_WIDE_INT*);
+rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code);
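
The newly exported ix86_tls_get_addr is where the i386.cc hunk below adds a
diagnostic for no_caller_saved_registers functions.  A hypothetical
reproducer, assuming -fPIC so the TLS access is expanded through
__tls_get_addr under -mtls-dialect=gnu:

/* Hypothetical reproducer; compile with -fPIC -mtls-dialect=gnu.  */
extern __thread int counter;

__attribute__ ((no_caller_saved_registers))
void bump (void)
{
  /* Global-dynamic TLS emits a __tls_get_addr call, which now triggers
     the error suggesting -mtls-dialect=gnu2.  */
  counter++;
}
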
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4682db85..471be3e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11382,6 +11382,23 @@ ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
return cost;
}
+
+/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */
+
+bool
+ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
+ unsigned int align,
+ enum by_pieces_operation op,
+ bool speed_p)
+{
+ /* Return true when we are currently expanding memcpy/memset epilogue
+ with move_by_pieces or store_by_pieces. */
+ if (cfun->machine->by_pieces_in_use)
+ return true;
+
+ return default_use_by_pieces_infrastructure_p (size, align, op,
+ speed_p);
+}
/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
this is used for to form addresses to local data when -fPIC is in
@@ -12439,9 +12456,31 @@ ix86_tls_index (void)
static GTY(()) rtx ix86_tls_symbol;
-static rtx
+rtx
ix86_tls_get_addr (void)
{
+ if (cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ {
+ /* __tls_get_addr doesn't preserve vector registers. When a
+	 function with the no_caller_saved_registers attribute calls
+ __tls_get_addr, YMM and ZMM registers will be clobbered.
+ Issue an error and suggest -mtls-dialect=gnu2 in this case. */
+ if (cfun->machine->func_type == TYPE_NORMAL)
+ error (G_("%<-mtls-dialect=gnu2%> must be used with a function"
+ " with the %<no_caller_saved_registers%> attribute"));
+ else
+ error (cfun->machine->func_type == TYPE_EXCEPTION
+ ? G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " exception service routine")
+ : G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " interrupt service routine"));
+ /* Don't issue the same error twice. */
+ cfun->machine->func_type = TYPE_NORMAL;
+ cfun->machine->call_saved_registers
+ = TYPE_DEFAULT_CALL_SAVED_REGISTERS;
+ }
+
if (!ix86_tls_symbol)
{
const char *sym
@@ -20007,7 +20046,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree utype, ures, vce;
utype = unsigned_type_for (TREE_TYPE (arg0));
/* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR
- instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */
+ instead of ABS_EXPR to handle overflow case(TYPE_MIN). */
ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0);
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
loc = gimple_location (stmt);
@@ -21491,8 +21530,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
/* Register pair for mask registers. */
if (mode == P2QImode || mode == P2HImode)
return 2;
- if (mode == V64SFmode || mode == V64SImode)
- return 4;
+
return 1;
}
@@ -22081,6 +22119,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
}
/* FALLTHRU */
case V32QImode:
+ if (TARGET_GFNI && constant_op1)
+ {
+	  /* Use vgf2p8affine.  It needs one extra load for the mask, but
+	     in a loop with enough registers the load will be hoisted out.
+	     So for now don't account for the constant mask load.  This is
+	     not quite right for non-loop vectorization.  */
+ extra = 0;
+ return ix86_vec_cost (mode, cost->sse_op) + extra;
+ }
if (TARGET_AVX2)
/* Use vpbroadcast. */
extra = cost->sse_op;
@@ -22115,6 +22162,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
count = 9;
return ix86_vec_cost (mode, cost->sse_op * count) + extra;
+ case V64QImode:
+ /* Ignore the mask load for GF2P8AFFINEQB. */
+ extra = 0;
+ return ix86_vec_cost (mode, cost->sse_op) + extra;
+
case V2DImode:
case V4DImode:
/* V*DImode arithmetic right shift is emulated. */
@@ -23132,7 +23184,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
So current solution is make constant disp as cheap as possible. */
if (GET_CODE (addr) == PLUS
&& x86_64_immediate_operand (XEXP (addr, 1), Pmode)
- /* Only hanlde (reg + disp) since other forms of addr are mostly LEA,
+ /* Only handle (reg + disp) since other forms of addr are mostly LEA,
there's no additional cost for the plus of disp. */
&& register_operand (XEXP (addr, 0), Pmode))
{
@@ -25211,20 +25263,14 @@ asm_preferred_eh_data_format (int code, int global)
return DW_EH_PE_absptr;
}
-/* Implement targetm.vectorize.builtin_vectorization_cost. */
+/* Worker for ix86_builtin_vectorization_cost and the fallback calls
+ from ix86_vector_costs::add_stmt_cost. */
static int
-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype, int)
+ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost,
+ machine_mode mode)
{
- bool fp = false;
- machine_mode mode = TImode;
+ bool fp = FLOAT_MODE_P (mode);
int index;
- if (vectype != NULL)
- {
- fp = FLOAT_TYPE_P (vectype);
- mode = TYPE_MODE (vectype);
- }
-
switch (type_of_cost)
{
case scalar_stmt:
@@ -25283,14 +25329,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
COSTS_N_INSNS
(ix86_cost->gather_static
+ ix86_cost->gather_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case vector_scatter_store:
return ix86_vec_cost (mode,
COSTS_N_INSNS
(ix86_cost->scatter_static
+ ix86_cost->scatter_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
@@ -25308,7 +25354,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
case vec_construct:
{
- int n = TYPE_VECTOR_SUBPARTS (vectype);
+ int n = GET_MODE_NUNITS (mode);
/* N - 1 element inserts into an SSE vector, the possible
GPR -> XMM move is accounted for in add_stmt_cost. */
if (GET_MODE_BITSIZE (mode) <= 128)
@@ -25336,6 +25382,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype, int)
+{
+ machine_mode mode = TImode;
+ if (vectype != NULL)
+ mode = TYPE_MODE (vectype);
+ return ix86_default_vector_cost (type_of_cost, mode);
+}
+
/* This function returns the calling abi specific va_list type node.
It returns the FNDECL specific va_list type. */
@@ -25768,15 +25825,20 @@ private:
unsigned m_num_sse_needed[3];
/* Number of 256-bit vector permutation. */
unsigned m_num_avx256_vec_perm[3];
+  /* Number of reductions for FMA/DOT_PROD_EXPR/SAD_EXPR.  */
+  unsigned m_num_reduc[X86_REDUC_LAST];
+  /* Do not unroll if m_prefer_unroll is false; it defaults to true.  */
+  bool m_prefer_unroll;
};
ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
: vector_costs (vinfo, costing_for_scalar),
m_num_gpr_needed (),
m_num_sse_needed (),
- m_num_avx256_vec_perm ()
-{
-}
+ m_num_avx256_vec_perm (),
+ m_num_reduc (),
+ m_prefer_unroll (true)
+{}
/* Implement targetm.vectorize.create_costs. */
@@ -25789,7 +25851,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
unsigned
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
- tree vectype, int misalign,
+ tree vectype, int,
vect_cost_model_location where)
{
unsigned retval = 0;
@@ -26073,6 +26135,125 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
}
}
+  /* In the vectorized body, note gather/scatter and any scalarization
+     (which make unrolling unattractive) and count reduction stmts.  */
+ if (where == vect_body && !m_costing_for_scalar)
+ {
+ switch (kind)
+ {
+ /* Emulated gather/scatter or any scalarization. */
+ case scalar_load:
+ case scalar_stmt:
+ case scalar_store:
+ case vector_gather_load:
+ case vector_scatter_store:
+ m_prefer_unroll = false;
+ break;
+
+ case vector_stmt:
+ case vec_to_scalar:
+	    /* Count reduction FMAs and "real" DOT_PROD_EXPRs; unrolling
+	       in the vectorizer will enable partial sums.  */
+ if (stmt_info
+ && vect_is_reduction (stmt_info)
+ && stmt_info->stmt)
+ {
+ /* Handle __builtin_fma. */
+ if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA)
+ {
+ m_num_reduc[X86_REDUC_FMA] += count;
+ break;
+ }
+
+ if (!is_gimple_assign (stmt_info->stmt))
+ break;
+
+ tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ tree rhs1, rhs2;
+ bool native_vnni_p = true;
+ gimple* def;
+ machine_mode mode_rhs;
+ switch (subcode)
+ {
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ if (!fp || !flag_associative_math
+ || flag_fp_contract_mode != FP_CONTRACT_FAST)
+ break;
+
+ /* FMA condition for different modes. */
+ if (((inner_mode == DFmode || inner_mode == SFmode)
+ && !TARGET_FMA && !TARGET_AVX512VL)
+ || (inner_mode == HFmode && !TARGET_AVX512FP16)
+ || (inner_mode == BFmode && !TARGET_AVX10_2))
+ break;
+
+ /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed
+ to FMA/FNMA after vectorization. */
+ rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
+ rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
+ if (subcode == PLUS_EXPR
+ && TREE_CODE (rhs1) == SSA_NAME
+ && (def = SSA_NAME_DEF_STMT (rhs1), true)
+ && is_gimple_assign (def)
+ && gimple_assign_rhs_code (def) == MULT_EXPR)
+ m_num_reduc[X86_REDUC_FMA] += count;
+ else if (TREE_CODE (rhs2) == SSA_NAME
+ && (def = SSA_NAME_DEF_STMT (rhs2), true)
+ && is_gimple_assign (def)
+ && gimple_assign_rhs_code (def) == MULT_EXPR)
+ m_num_reduc[X86_REDUC_FMA] += count;
+ break;
+
+	      /* The vectorizer's lane_reducing_op_p supports DOT_PROD_EXPR,
+		 WIDEN_SUM_EXPR and SAD_EXPR; the x86 backend only supports
+		 SAD_EXPR (usad{v16qi,v32qi,v64qi}) and DOT_PROD_EXPR.  */
+ case DOT_PROD_EXPR:
+ rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
+ mode_rhs = TYPE_MODE (TREE_TYPE (rhs1));
+ if (mode_rhs == QImode)
+ {
+ rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
+ signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1));
+ signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2));
+
+ /* vpdpbusd. */
+ if (signop1_p != signop2_p)
+ native_vnni_p
+ = (GET_MODE_SIZE (mode) == 64
+ ? TARGET_AVX512VNNI
+ : ((TARGET_AVX512VNNI && TARGET_AVX512VL)
+ || TARGET_AVXVNNI));
+ else
+ /* vpdpbssd. */
+ native_vnni_p
+ = (GET_MODE_SIZE (mode) == 64
+ ? TARGET_AVX10_2
+ : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2));
+ }
+ m_num_reduc[X86_REDUC_DOT_PROD] += count;
+
+	      /* Penalize emulated DOT_PROD_EXPR so that unrolling and
+		 partial sums are discouraged for it.  */
+ if (!native_vnni_p)
+ m_num_reduc[X86_REDUC_DOT_PROD] += 3 * count;
+ break;
+
+ case SAD_EXPR:
+ m_num_reduc[X86_REDUC_SAD] += count;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ default:
+ break;
+ }
+ }
+
combined_fn cfn;
if ((kind == vector_stmt || kind == scalar_stmt)
&& stmt_info
@@ -26128,32 +26309,23 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(AGU and load ports). Try to account for this by scaling the
construction cost by the number of elements involved. */
if ((kind == vec_construct || kind == vec_to_scalar)
- && ((stmt_info
- && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ && ((node
+ && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
+ && SLP_TREE_LANES (node) == 1))
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+ (SLP_TREE_REPRESENTATIVE (node))))
!= INTEGER_CST))
- || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
- == VMAT_GATHER_SCATTER)))
- || (node
- && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
- && SLP_TREE_LANES (node) == 1))
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
- (SLP_TREE_REPRESENTATIVE (node))))
- != INTEGER_CST))
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
- == VMAT_GATHER_SCATTER)))))
- {
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ || mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))))))
+ {
+ stmt_cost = ix86_default_vector_cost (kind, mode);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
}
else if ((kind == vec_construct || kind == scalar_to_vec)
&& node
&& SLP_TREE_DEF_TYPE (node) == vect_external_def)
{
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
unsigned i;
tree op;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
@@ -26217,7 +26389,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
TREE_VISITED (op) = 0;
}
if (stmt_cost == -1)
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
if (kind == vec_perm && vectype
&& GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
@@ -26288,6 +26460,41 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
&& (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
> ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
m_costs[vect_body] = INT_MAX;
+
+ bool any_reduc_p = false;
+ for (int i = 0; i != X86_REDUC_LAST; i++)
+ if (m_num_reduc[i])
+ {
+ any_reduc_p = true;
+ break;
+ }
+
+ if (any_reduc_p
+	  /* Not much gain for loops with gathers and scatters.  */
+ && m_prefer_unroll
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ {
+ unsigned unroll_factor
+ = OPTION_SET_P (ix86_vect_unroll_limit)
+ ? ix86_vect_unroll_limit
+ : ix86_cost->vect_unroll_limit;
+
+ if (unroll_factor > 1)
+ {
+	      for (int i = 0; i != X86_REDUC_LAST; i++)
+ {
+ if (m_num_reduc[i])
+ {
+ unsigned tmp = CEIL (ix86_cost->reduc_lat_mult_thr[i],
+ m_num_reduc[i]);
+ unroll_factor = MIN (unroll_factor, tmp);
+ }
+ }
+
+ m_suggested_unroll_factor = 1 << ceil_log2 (unroll_factor);
+ }
+ }
+
}
ix86_vect_estimate_reg_pressure ();
@@ -27171,9 +27378,9 @@ ix86_memtag_can_tag_addresses ()
return ix86_lam_type != lam_none && TARGET_LP64;
}
-/* Implement TARGET_MEMTAG_TAG_SIZE. */
+/* Implement TARGET_MEMTAG_TAG_BITSIZE. */
unsigned char
-ix86_memtag_tag_size ()
+ix86_memtag_tag_bitsize ()
{
return IX86_HWASAN_TAG_SIZE;
}
@@ -27744,6 +27951,10 @@ static const scoped_attribute_specs *const ix86_attribute_table[] =
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+ ix86_use_by_pieces_infrastructure_p
+
#undef TARGET_OVERLAP_OP_BY_PIECES_P
#define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
@@ -28147,8 +28358,8 @@ ix86_libgcc_floating_mode_supported_p
#undef TARGET_MEMTAG_UNTAGGED_POINTER
#define TARGET_MEMTAG_UNTAGGED_POINTER ix86_memtag_untagged_pointer
-#undef TARGET_MEMTAG_TAG_SIZE
-#define TARGET_MEMTAG_TAG_SIZE ix86_memtag_tag_size
+#undef TARGET_MEMTAG_TAG_BITSIZE
+#define TARGET_MEMTAG_TAG_BITSIZE ix86_memtag_tag_bitsize
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST ix86_gen_ccmp_first
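
To make the unroll heuristic in finish_cost above concrete, here is a worked
example with illustrative numbers only; in the pass the inputs come from
ix86_cost->vect_unroll_limit and ix86_cost->reduc_lat_mult_thr[].

/* Illustrative values, not taken from any tuning table.  */
static unsigned
example_suggested_unroll (void)
{
  unsigned limit = 4;	  /* vect_unroll_limit (assumed).		  */
  unsigned fma_thr = 8;	  /* reduc_lat_mult_thr[X86_REDUC_FMA] (assumed). */
  unsigned num_fma = 3;	  /* FMA reductions counted in the loop body.	  */

  unsigned tmp = (fma_thr + num_fma - 1) / num_fma;  /* CEIL (8, 3) = 3.  */
  unsigned factor = tmp < limit ? tmp : limit;	      /* MIN (4, 3) = 3.   */

  /* Round up to a power of two, as 1 << ceil_log2 (factor) does.  */
  unsigned suggested = 1;
  while (suggested < factor)
    suggested <<= 1;
  return suggested;	  /* 4 in this example.  */
}
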
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 791f3b9..ac0ce68 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -102,6 +102,15 @@ struct stringop_algs
#define COSTS_N_BYTES(N) ((N) * 2)
#endif
+
+enum ix86_reduc_unroll_factor {
+ X86_REDUC_FMA,
+ X86_REDUC_DOT_PROD,
+ X86_REDUC_SAD,
+
+ X86_REDUC_LAST
+};
+
/* Define the specific costs for a given cpu. NB: hard_register is used
by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute
hard register move costs by register allocator. Relative costs of
@@ -225,6 +234,13 @@ struct processor_costs {
to number of instructions executed in
parallel. See also
ix86_reassociation_width. */
+ const unsigned reduc_lat_mult_thr[X86_REDUC_LAST];
+				/* Latency times throughput of
+				   FMA/DOT_PROD_EXPR/SAD_EXPR; used to
+				   determine the unroll factor in the
+				   vectorizer.  */
+ const unsigned vect_unroll_limit; /* Limit how much the autovectorizer
+ may unroll a loop. */
struct stringop_algs *memcpy, *memset;
const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer
cost model. */
@@ -644,7 +660,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
{"cpu_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \
{"arch", "%{!march=*:-march=%(VALUE)}"}, \
{"arch_32", "%{" OPT_ARCH32 ":%{!march=*:-march=%(VALUE)}}"}, \
- {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"},
+ {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, \
+ {"tls", "%{!mtls-dialect=*:-mtls-dialect=%(VALUE)}"},
/* Specs for the compiler proper */
@@ -2477,9 +2494,9 @@ constexpr wide_int_bitmask PTA_DIAMONDRAPIDS = PTA_GRANITERAPIDS_D
| PTA_MOVRS | PTA_AMX_MOVRS | PTA_USER_MSR;
constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
- | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
- | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
+ | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL
+ | PTA_AVX | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
constexpr wide_int_bitmask PTA_BDVER2 = PTA_BDVER1 | PTA_BMI | PTA_TBM
| PTA_F16C | PTA_FMA;
constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT
@@ -2487,13 +2504,13 @@ constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT
constexpr wide_int_bitmask PTA_BDVER4 = PTA_BDVER3 | PTA_AVX2 | PTA_BMI2
| PTA_RDRND | PTA_MOVBE | PTA_MWAITX;
-constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
- | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
- | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2
- | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT
- | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
- | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES | PTA_SHA | PTA_LZCNT
- | PTA_POPCNT;
+constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
+ | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL
+ | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
+ | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE
+ | PTA_MWAITX | PTA_ADX | PTA_RDSEED | PTA_CLZERO | PTA_CLFLUSHOPT
+ | PTA_XSAVEC | PTA_XSAVES | PTA_SHA;
constexpr wide_int_bitmask PTA_ZNVER2 = PTA_ZNVER1 | PTA_CLWB | PTA_RDPID
| PTA_WBNOINVD;
constexpr wide_int_bitmask PTA_ZNVER3 = PTA_ZNVER2 | PTA_VAES | PTA_VPCLMULQDQ
@@ -2506,19 +2523,19 @@ constexpr wide_int_bitmask PTA_ZNVER5 = PTA_ZNVER4 | PTA_AVXVNNI
| PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_PREFETCHI;
constexpr wide_int_bitmask PTA_BTVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16
- | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_LZCNT | PTA_POPCNT
+ | PTA_ABM | PTA_CX16 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
constexpr wide_int_bitmask PTA_BTVER2 = PTA_BTVER1 | PTA_SSE4_1 | PTA_SSE4_2
| PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_BMI | PTA_F16C | PTA_MOVBE
| PTA_XSAVEOPT;
constexpr wide_int_bitmask PTA_LUJIAZUI = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
- | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI | PTA_BMI2 | PTA_PRFCHW
- | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE
- | PTA_ADX | PTA_RDSEED | PTA_POPCNT;
+ | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_LZCNT | PTA_POPCNT | PTA_ABM
+ | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI
+ | PTA_BMI2 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
+ | PTA_RDRND | PTA_MOVBE | PTA_ADX | PTA_RDSEED;
constexpr wide_int_bitmask PTA_YONGFENG = PTA_LUJIAZUI | PTA_AVX | PTA_AVX2
- | PTA_F16C | PTA_FMA | PTA_SHA | PTA_LZCNT;
+ | PTA_F16C | PTA_FMA | PTA_SHA;
#ifndef GENERATOR_FILE
@@ -2865,6 +2882,9 @@ struct GTY(()) machine_function {
approximation. */
BOOL_BITFIELD tls_descriptor_call_expanded_p : 1;
+ /* True if TLS descriptor is called more than once. */
+ BOOL_BITFIELD tls_descriptor_call_multiple_p : 1;
+
/* If true, the current function has a STATIC_CHAIN is placed on the
stack below the return address. */
BOOL_BITFIELD static_chain_on_stack : 1;
@@ -2934,6 +2954,9 @@ struct GTY(()) machine_function {
/* True if this is a recursive function. */
BOOL_BITFIELD recursive_function : 1;
+ /* True if by_pieces op is currently in use. */
+ BOOL_BITFIELD by_pieces_in_use : 1;
+
/* The largest alignment, in bytes, of stack slot actually used. */
unsigned int max_used_stack_alignment;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eb52699..cea6c15 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -901,6 +901,10 @@
(define_attr "avx_partial_xmm_update" "false,true"
(const_string "false"))
+;; Define attribute to indicate 64-bit TLS insns.
+(define_attr "tls64" "gd,ld_base,call,combine,lea,none"
+ (const_string "none"))
+
;; Define attribute to classify add/sub insns that consumes carry flag (CF)
(define_attr "use_carry" "0,1" (const_string "0"))
@@ -1618,10 +1622,8 @@
(compare
(match_operand:QI 0 "nonimmediate_operand" "QBn")
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%h1, %0|%0, %h1}"
[(set_attr "addr" "gpr8")
@@ -1632,10 +1634,8 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "const0_operand")))]
"ix86_match_ccmode (insn, CCNOmode)"
"test{b}\t%h0, %h0"
@@ -1657,10 +1657,8 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "general_operand" "QnBn")))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%1, %h0|%h0, %1}"
@@ -1672,15 +1670,11 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "icmp")
@@ -2968,7 +2962,8 @@
(match_operand:SWI248 1 "const_int_operand"))]
"optimize_insn_for_size_p () && optimize_size > 1
&& operands[1] != const0_rtx
- && operands[1] != constm1_rtx
+ && (operands[1] != constm1_rtx
+ || (<MODE>mode == DImode && LEGACY_INT_REG_P (operands[0])))
&& IN_RANGE (INTVAL (operands[1]), -128, 127)
&& !ix86_red_zone_used
&& REGNO (operands[0]) != SP_REG"
@@ -3479,10 +3474,8 @@
[(set (strict_low_part
(match_operand:QI 0 "register_operand" "+Q"))
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"mov{b}\t{%h1, %0|%0, %h1}"
[(set_attr "type" "imov")
@@ -3565,10 +3558,8 @@
(define_insn "*extzvqi"
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn,?R")
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q,Q")]) 0))]
""
{
switch (get_attr_type (insn))
@@ -3689,10 +3680,8 @@
(match_operand 0 "int248_register_operand" "+Q")
(const_int 8)
(const_int 8))
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]))]
""
"mov{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "imov")
@@ -5259,10 +5248,8 @@
[(set (match_operand:SWI24 0 "register_operand" "=R")
(sign_extend:SWI24
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
""
"movs{b<SWI24:imodesuffix>|x}\t{%h1, %0|%0, %h1}"
[(set_attr "type" "imovx")
@@ -7008,10 +6995,8 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -7025,8 +7010,8 @@
[(set (strict_low_part (match_dup 0))
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7037,29 +7022,25 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7474,10 +7455,8 @@
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -7490,29 +7469,25 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7542,10 +7517,8 @@
(subreg:SWI248
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -7580,8 +7553,8 @@
(subreg:SWI248
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7601,15 +7574,11 @@
(subreg:SWI248
(plusminus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "<comm>0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "<comm>0,!Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -7628,11 +7597,11 @@
(subreg:SWI248
(plusminus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8229,10 +8198,8 @@
(minus:QI
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"@
@@ -8246,8 +8213,8 @@
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8257,30 +8224,26 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8331,10 +8294,8 @@
(minus:QI
(match_operand:QI 1 "nonimmediate_operand" "0")
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"sub{b}\t{%h2, %0|%0, %h2}"
@@ -8346,30 +8307,26 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8384,10 +8341,8 @@
(subreg:SWI248
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -8406,8 +8361,8 @@
(subreg:SWI248
(minus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -12355,10 +12310,8 @@
(compare
(and:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "general_operand" "QnBn"))
(const_int 0)))]
"ix86_match_ccmode (insn, CCNOmode)"
@@ -12372,15 +12325,11 @@
(compare
(and:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0))
(const_int 0)))]
"ix86_match_ccmode (insn, CCNOmode)"
"test{b}\t{%h1, %h0|%h0, %h1}"
@@ -12969,10 +12918,8 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -12986,8 +12933,8 @@
[(set (strict_low_part (match_dup 0))
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -12998,29 +12945,25 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13223,10 +13166,8 @@
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -13239,29 +13180,25 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13291,10 +13228,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -13313,8 +13248,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13328,10 +13263,8 @@
(match_operator 5 "compare_operator"
[(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn"))
(const_int 0)]))
(set (zero_extract:SWI248
@@ -13341,8 +13274,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))]
"ix86_match_ccmode (insn, CCNOmode)"
"@
@@ -13358,9 +13291,9 @@
[(set (match_dup 4)
(match_op_dup 5
[(any_logic:QI
- (subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (subreg:QI
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2))
(const_int 0)]))
(set (zero_extract:SWI248
@@ -13368,8 +13301,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))])]
""
[(set_attr "addr" "gpr8")
@@ -13385,15 +13318,11 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "%0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "%0,!Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -13412,11 +13341,11 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -13428,12 +13357,10 @@
(match_operand 0 "int248_register_operand" "+Q,&Q")
(const_int 8)
(const_int 8))
- (match_operator:SWI248 3 "extract_operator"
+ (match_operator:SWI248 3 "extract_high_operator"
[(any_logic
(match_operand 1 "int248_register_operand" "%0,!Q")
- (match_operand 2 "int248_register_operand" "Q,Q"))
- (const_int 8)
- (const_int 8)]))
+ (match_operand 2 "int248_register_operand" "Q,Q"))]))
(clobber (reg:CC FLAGS_REG))]
"GET_MODE (operands[1]) == GET_MODE (operands[2])"
"@
@@ -13449,9 +13376,9 @@
(parallel
[(set (zero_extract:SWI248
(match_dup 0) (const_int 8) (const_int 8))
- (match_op_dup 3
- [(any_logic (match_dup 4) (match_dup 2))
- (const_int 8) (const_int 8)]))
+ (zero_extract:SWI248
+ (any_logic (match_dup 4) (match_dup 2))
+ (const_int 8) (const_int 8)))
(clobber (reg:CC FLAGS_REG))])]
"operands[4] = gen_lowpart (GET_MODE (operands[1]), operands[0]);"
[(set_attr "type" "alu")
@@ -14696,10 +14623,8 @@
(subreg:SWI248
(neg:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -14717,8 +14642,8 @@
(subreg:SWI248
(neg:QI
(subreg:QI
- (match_op_dup 2
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "negnot")
@@ -15350,13 +15275,9 @@
(match_operand 0 "int248_register_operand" "+Q,&Q")
(const_int 8)
(const_int 8))
- (subreg:SWI248
- (not:QI
- (subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))]
+ (not:SWI248
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")])))]
""
"@
not{b}\t%h0
@@ -15369,11 +15290,8 @@
(match_dup 1) (const_int 8) (const_int 8)))
(set (zero_extract:SWI248
(match_dup 0) (const_int 8) (const_int 8))
- (subreg:SWI248
- (not:QI
- (subreg:QI
- (match_op_dup 2
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))]
+ (not:SWI248
+ (zero_extract:SWI248 (match_dup 0) (const_int 8) (const_int 8))))]
""
[(set_attr "type" "negnot")
(set_attr "mode" "QI")])
@@ -16720,10 +16638,8 @@
(subreg:SWI248
(ashift:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -16757,8 +16673,8 @@
(subreg:SWI248
(ashift:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -18004,10 +17920,8 @@
(subreg:SWI248
(any_shiftrt:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -18033,8 +17947,8 @@
(subreg:SWI248
(any_shiftrt:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -18388,17 +18302,17 @@
(any_rotate:SWI
(match_operand:SWI 1 "const_int_operand")
(subreg:QI
- (and
- (match_operand 2 "int248_register_operand")
- (match_operand 3 "const_int_operand")) 0)))]
+ (match_operator 4 "and_operator"
+ [(match_operand 2 "int248_register_operand")
+ (match_operand 3 "const_int_operand")]) 0)))]
"(INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1))
== GET_MODE_BITSIZE (<MODE>mode) - 1"
- [(set (match_dup 4) (match_dup 1))
+ [(set (match_dup 5) (match_dup 1))
(set (match_dup 0)
- (any_rotate:SWI (match_dup 4)
+ (any_rotate:SWI (match_dup 5)
(subreg:QI
- (and:SI (match_dup 2) (match_dup 3)) 0)))]
- "operands[4] = gen_reg_rtx (<MODE>mode);")
+ (match_op_dup 4 [(match_dup 2) (match_dup 3)]) 0)))]
+ "operands[5] = gen_reg_rtx (<MODE>mode);")
(define_insn_and_split "*<insn><mode>3_mask_1"
[(set (match_operand:SWI 0 "nonimmediate_operand")
@@ -23243,6 +23157,7 @@
return "call\t{*%p2@GOTPCREL(%%rip)|[QWORD PTR %p2@GOTPCREL[rip]]}";
}
[(set_attr "type" "multi")
+ (set_attr "tls64" "gd")
(set (attr "length")
(symbol_ref "TARGET_X32 ? 15 : 16"))])
@@ -23281,7 +23196,11 @@
UNSPEC_TLS_GD)
(clobber (match_operand:P 3 "register_operand"))])]
"TARGET_64BIT"
- "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
+{
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
+ ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
(define_insn "*tls_local_dynamic_base_32_gnu"
[(set (match_operand:SI 0 "register_operand" "=a")
@@ -23343,6 +23262,7 @@
return "call\t{*%p1@GOTPCREL(%%rip)|[QWORD PTR %p1@GOTPCREL[rip]]}";
}
[(set_attr "type" "multi")
+ (set_attr "tls64" "ld_base")
(set_attr "length" "12")])
(define_insn "*tls_local_dynamic_base_64_largepic"
@@ -23376,7 +23296,11 @@
(unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE)
(clobber (match_operand:P 2 "register_operand"))])]
"TARGET_64BIT"
- "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
+{
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
+ ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
;; Local dynamic of a single variable is a lose. Show combine how
;; to convert that back to global dynamic.
@@ -23570,6 +23494,8 @@
"TARGET_64BIT && TARGET_GNU2_TLS"
{
operands[2] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0];
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
ix86_tls_descriptor_calls_expanded_in_cfun = true;
})
@@ -23581,6 +23507,7 @@
"lea%z0\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}"
[(set_attr "type" "lea")
(set_attr "mode" "<MODE>")
+ (set_attr "tls64" "lea")
(set_attr "length" "7")
(set_attr "length_address" "4")])
@@ -23594,6 +23521,7 @@
"TARGET_64BIT && TARGET_GNU2_TLS"
"call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}"
[(set_attr "type" "call")
+ (set_attr "tls64" "call")
(set_attr "length" "2")
(set_attr "length_address" "0")])
@@ -23615,7 +23543,8 @@
{
operands[4] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0];
emit_insn (gen_tls_dynamic_gnu2_64 (ptr_mode, operands[4], operands[1]));
-})
+}
+ [(set_attr "tls64" "combine")])
(define_split
[(match_operand 0 "tls_address_pattern")]
@@ -28251,10 +28180,8 @@
(match_operator 1 "compare_operator"
[(and:QI
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand")]) 0)
(match_operand 3 "const_int_operand"))
(const_int 0)]))]
"! TARGET_PARTIAL_REG_STALL
@@ -28266,9 +28193,9 @@
(match_op_dup 1
[(and:QI
(subreg:QI
- (match_op_dup 4 [(match_dup 2)
- (const_int 8)
- (const_int 8)]) 0)
+ (zero_extract:SWI248 (match_dup 2)
+ (const_int 8)
+ (const_int 8)) 0)
(match_dup 3))
(const_int 0)]))
(set (zero_extract:SWI248 (match_dup 2)
@@ -28277,9 +28204,9 @@
(subreg:SWI248
(and:QI
(subreg:QI
- (match_op_dup 4 [(match_dup 2)
- (const_int 8)
- (const_int 8)]) 0)
+ (zero_extract:SWI248 (match_dup 2)
+ (const_int 8)
+ (const_int 8)) 0)
(match_dup 3)) 0))])])
;; Don't do logical operations with memory inputs.
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c93c0b1..6bda22f 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1246,6 +1246,10 @@ munroll-only-small-loops
Target Var(ix86_unroll_only_small_loops) Init(0) Optimization
Enable conservative small loop unrolling.
+-param=ix86-vect-unroll-limit=
+Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param
+Limit how much the autovectorizer may unroll a loop.
+
mlam=
Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none)
-mlam=[none|u48|u57] Instrument meta data position in user data pointers.
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b2d2eec..5dbe444 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1319,6 +1319,9 @@
(ior (match_operand 0 "nonimmediate_operand")
(match_test "const_vec_duplicate_p (op)")))
+(define_predicate "const_vec_dup_operand"
+ (match_test "const_vec_duplicate_p (op)"))
+
;; Return true when OP is either register operand, or any
;; CONST_VECTOR.
(define_predicate "reg_or_const_vector_operand"
@@ -1714,10 +1717,14 @@
(define_predicate "div_operator"
(match_code "div"))
-;; Return true if this is a and, ior or xor operation.
+;; Return true if this is an and, ior or xor operation.
(define_predicate "logic_operator"
(match_code "and,ior,xor"))
+;; Return true if this is an and operation.
+(define_predicate "and_operator"
+ (match_code "and"))
+
;; Return true if this is a plus, minus, and, ior or xor operation.
(define_predicate "plusminuslogic_operator"
(match_code "plus,minus,and,ior,xor"))
@@ -1740,8 +1747,12 @@
(define_predicate "compare_operator"
(match_code "compare"))
-(define_predicate "extract_operator"
- (match_code "zero_extract,sign_extract"))
+(define_predicate "extract_high_operator"
+ (match_code "zero_extract,sign_extract,ashiftrt,lshiftrt")
+{
+ return (const8_operand (XEXP (op, 1), VOIDmode)
+ && (BINARY_P (op) || const8_operand (XEXP (op, 2), VOIDmode)));
+})
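The predicate above folds two RTL spellings of the same operation into one: extracting the 8-bit field that starts at bit 8, and shifting right by 8 and using the low byte of the result. A small C illustration of why those are the same access (the function names are mine, purely illustrative):

#include <stdint.h>

/* Both functions read the "high byte" (bits 8..15) of a wider value:
   one as a logical shift right by 8, one as a bit-field extract.  The
   extract_high_operator predicate lets a single machine pattern match
   either RTL shape.  */
static uint8_t
high_byte_via_shift (uint32_t x)
{
  return (uint8_t) (x >> 8);
}

struct two_bytes { uint32_t lo : 8, hi : 8; };

static uint8_t
high_byte_via_field (struct two_bytes v)
{
  return (uint8_t) v.hi;
}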
;; Return true if OP is a memory operand, aligned to
;; less than its natural alignment.
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d88c3d6..73906b8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -326,6 +326,9 @@
(define_mode_iterator VI1_AVX512VL
[V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
+(define_mode_iterator VI1_AVX512_3264
+ [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX")])
+
;; All vector modes
(define_mode_iterator V
[(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
@@ -21729,6 +21732,19 @@
(const_string "orig")))
(set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
+;; Eliminate redundancy caused by
+;; /* Special case TImode to 128-bit vector conversions via V2DI. */
+;; in ix86_expand_vector_move
+
+(define_split
+ [(set (match_operand:V2DI 0 "register_operand")
+ (vec_concat:V2DI
+ (subreg:DI (match_operand:TI 1 "register_operand") 0)
+ (subreg:DI (match_dup 1) 8)))]
+ "TARGET_SSE2 && ix86_pre_reload_split ()"
+ [(set (match_dup 0)
+ (subreg:V2DI (match_dup 1) 0))])
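The split above notices that re-concatenating the two 64-bit halves of one TImode value is only a change of mode, not a data rearrangement. A minimal C sketch of the equivalence, assuming GCC/Clang __int128 support and SSE2 intrinsics (function names are mine):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* What the pre-split RTL does: pull the two halves out of a 128-bit
   integer and rebuild a V2DI vector from them.  */
static __m128i
concat_halves (unsigned __int128 t)
{
  uint64_t lo = (uint64_t) t;
  uint64_t hi = (uint64_t) (t >> 64);
  return _mm_set_epi64x ((long long) hi, (long long) lo);
}

/* What the split rewrites it to: the same bits, reinterpreted in place.  */
static __m128i
bitcast_ti (unsigned __int128 t)
{
  __m128i v;
  memcpy (&v, &t, sizeof v);
  return v;
}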
+
(define_insn "*vec_concatv2di_0"
[(set (match_operand:V2DI 0 "register_operand" "=v,v ,x")
(vec_concat:V2DI
@@ -26546,9 +26562,9 @@
;; XOP packed rotate instructions
(define_expand "rotl<mode>3"
- [(set (match_operand:VI_128 0 "register_operand")
- (rotate:VI_128
- (match_operand:VI_128 1 "nonimmediate_operand")
+ [(set (match_operand:VI248_128 0 "register_operand")
+ (rotate:VI248_128
+ (match_operand:VI248_128 1 "nonimmediate_operand")
(match_operand:SI 2 "general_operand")))]
"TARGET_XOP"
{
@@ -26577,9 +26593,9 @@
})
(define_expand "rotr<mode>3"
- [(set (match_operand:VI_128 0 "register_operand")
- (rotatert:VI_128
- (match_operand:VI_128 1 "nonimmediate_operand")
+ [(set (match_operand:VI248_128 0 "register_operand")
+ (rotatert:VI248_128
+ (match_operand:VI248_128 1 "nonimmediate_operand")
(match_operand:SI 2 "general_operand")))]
"TARGET_XOP"
{
@@ -26951,31 +26967,122 @@
int i;
if (<CODE> != ASHIFT)
- {
- if (CONST_INT_P (operands[2]))
- operands[2] = GEN_INT (-INTVAL (operands[2]));
- else
- negate = true;
- }
+ {
+ if (CONST_INT_P (operands[2]))
+ operands[2] = GEN_INT (-INTVAL (operands[2]));
+ else
+ negate = true;
+ }
par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
tmp = lowpart_subreg (QImode, operands[2], SImode);
for (i = 0; i < 16; i++)
- XVECEXP (par, 0, i) = tmp;
+ XVECEXP (par, 0, i) = tmp;
tmp = gen_reg_rtx (V16QImode);
emit_insn (gen_vec_initv16qiqi (tmp, par));
if (negate)
- emit_insn (gen_negv16qi2 (tmp, tmp));
+ emit_insn (gen_negv16qi2 (tmp, tmp));
gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
emit_insn (gen (operands[0], operands[1], tmp));
}
+ else if (TARGET_GFNI && CONST_INT_P (operands[2])
+ && (<MODE_SIZE> == 64
+ || !(INTVAL (operands[2]) == 7 && <CODE> == ASHIFTRT)))
+ {
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+ <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+ const0_rtx));
+ }
else
ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
DONE;
})
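The GFNI path works because GF2P8AFFINEQB applies, per byte, an affine map over GF(2): each result bit is the XOR-reduction (parity) of the source byte ANDed with one row of an 8x8 bit matrix, optionally XORed with a constant bit. Every constant byte shift or rotate is such a map, which is what ix86_vgf2p8affine_shift_matrix encodes. A C model of the math only (the row and bit ordering the instruction expects is handled by that helper and is not reproduced here; function names are mine):

#include <stdint.h>

/* Multiply an 8x8 bit matrix by a byte over GF(2): result bit j is the
   parity of (row[j] & x), XORed with constant bit j of b.  */
static uint8_t
gf2_affine_byte (const uint8_t row[8], uint8_t x, uint8_t b)
{
  uint8_t r = b;
  for (int j = 0; j < 8; j++)
    r ^= (uint8_t) (__builtin_parity (row[j] & x) << j);
  return r;
}

/* Example: a logical left shift by 3 -- result bit j takes input bit j-3,
   so row j selects exactly that bit.  */
static uint8_t
shl3_via_matrix (uint8_t x)
{
  uint8_t row[8] = { 0 };
  for (int j = 3; j < 8; j++)
    row[j] = (uint8_t) (1u << (j - 3));
  return gf2_affine_byte (row, x, 0);   /* equals (uint8_t) (x << 3) */
}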
+(define_expand "cond_<insn><mode>"
+ [(set (match_operand:VI1_AVX512VL 0 "register_operand")
+ (vec_merge:VI1_AVX512VL
+ (any_shift:VI1_AVX512VL
+ (match_operand:VI1_AVX512VL 2 "register_operand")
+ (match_operand:VI1_AVX512VL 3 "const_vec_dup_operand"))
+ (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+ "TARGET_GFNI && TARGET_AVX512F"
+{
+ rtx count = XVECEXP (operands[3], 0, 0);
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix,
+ const0_rtx, operands[4],
+ operands[1]));
+ DONE;
+})
+
+(define_expand "<insn><mode>3"
+ [(set (match_operand:VI1_AVX512_3264 0 "register_operand")
+ (any_rotate:VI1_AVX512_3264
+ (match_operand:VI1_AVX512_3264 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand")))]
+ "TARGET_GFNI"
+{
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+ const0_rtx));
+ DONE;
+})
+
+(define_expand "<insn>v16qi3"
+ [(set (match_operand:V16QI 0 "register_operand")
+ (any_rotate:V16QI
+ (match_operand:V16QI 1 "nonimmediate_operand")
+ (match_operand:SI 2 "general_operand")))]
+ "TARGET_GFNI || TARGET_XOP"
+{
+ /* Handle the V16QI XOP case to avoid a conflict with the other expand. */
+ if (TARGET_XOP)
+ {
+ if (! const_0_to_7_operand (operands[2], SImode))
+ {
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ rtx op2 = operands[2];
+ int i;
+
+ if (GET_MODE (op2) != QImode)
+ {
+ op2 = gen_reg_rtx (QImode);
+ convert_move (op2, operands[2], false);
+ }
+
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = op2;
+
+ emit_insn (gen_vec_initv16qiqi (reg, par));
+ if (<CODE> == ROTATERT)
+ {
+ rtx neg = gen_reg_rtx (V16QImode);
+ emit_insn (gen_negv16qi2 (neg, reg));
+ reg = neg;
+ }
+ emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg));
+ DONE;
+ }
+ }
+ else if (TARGET_GFNI && CONST_INT_P (operands[2]))
+ {
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+ emit_insn (gen_vgf2p8affineqb_v16qi (operands[0],
+ force_reg (V16QImode, operands[1]),
+ matrix, const0_rtx));
+ DONE;
+ }
+ else
+ FAIL;
+})
+
(define_expand "ashrv2di3"
[(set (match_operand:V2DI 0 "register_operand")
(ashiftrt:V2DI
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index c8603b9..1649ea2 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -141,6 +141,12 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
ix86_size_memcpy,
ix86_size_memset,
COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
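The two new entries feed the i386 vector cost hooks: the {FMA, DOT_PROD_EXPR, SAD_EXPR} latency-times-throughput triple suggests how many unrolled copies are needed to hide a reduction's latency, and the final value (also user-visible as --param=ix86-vect-unroll-limit) caps the result. A rough, hypothetical sketch of how such numbers combine (the names below are mine; the real logic lives in the i386 vectorizer cost code, not here):

/* Hypothetical sketch.  reduc_cycles stands for one of the latency*throughput
   entries above; unroll_limit is the per-target cap.  */
static int
suggested_unroll_factor (int reduc_cycles, int unroll_limit)
{
  int factor = reduc_cycles;       /* enough copies in flight to hide latency */
  if (factor > unroll_limit)
    factor = unroll_limit;         /* never beyond the target's cap */
  return factor < 1 ? 1 : factor;
}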
@@ -261,6 +267,12 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
i386_memcpy,
i386_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -382,6 +394,12 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
i486_memcpy,
i486_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -501,6 +519,12 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -613,6 +637,12 @@ struct processor_costs lakemont_cost = {
COSTS_N_INSNS (5), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -740,6 +770,12 @@ struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentiumpro_memcpy,
pentiumpro_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -858,6 +894,12 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
geode_memcpy,
geode_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -979,6 +1021,12 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (2), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (2), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
k6_memcpy,
k6_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1101,6 +1149,12 @@ struct processor_costs athlon_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
athlon_memcpy,
athlon_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1232,6 +1286,12 @@ struct processor_costs k8_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
k8_memcpy,
k8_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1371,6 +1431,12 @@ struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
amdfam10_memcpy,
amdfam10_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -1503,6 +1569,12 @@ const struct processor_costs bdver_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
bdver_memcpy,
bdver_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1668,6 +1740,12 @@ struct processor_costs znver1_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {5, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver1_memcpy,
znver1_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1836,6 +1914,12 @@ struct processor_costs znver2_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {10, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1979,6 +2063,12 @@ struct processor_costs znver3_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2125,6 +2215,12 @@ struct processor_costs znver4_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2287,6 +2383,12 @@ struct processor_costs znver5_cost = {
We increase width to 6 for multiplications
in ix86_reassociation_width. */
6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2422,6 +2524,12 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
skylake_memcpy,
skylake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2559,6 +2667,12 @@ struct processor_costs icelake_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 10, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
icelake_memcpy,
icelake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2690,6 +2804,12 @@ struct processor_costs alderlake_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
alderlake_memcpy,
alderlake_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2814,6 +2934,12 @@ const struct processor_costs btver1_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
btver1_memcpy,
btver1_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -2935,6 +3061,12 @@ const struct processor_costs btver2_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
btver2_memcpy,
btver2_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -3055,6 +3187,12 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium4_memcpy,
pentium4_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3178,6 +3316,12 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
nocona_memcpy,
nocona_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3299,6 +3443,12 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 2, /* Limit how much the autovectorizer
+ may unroll a loop. */
atom_memcpy,
atom_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3420,6 +3570,12 @@ struct processor_costs slm_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
slm_memcpy,
slm_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3555,6 +3711,12 @@ struct processor_costs tremont_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
tremont_memcpy,
tremont_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3681,6 +3843,12 @@ struct processor_costs lujiazui_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
lujiazui_memcpy,
lujiazui_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3805,6 +3973,12 @@ struct processor_costs yongfeng_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
yongfeng_memcpy,
yongfeng_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3929,6 +4103,12 @@ struct processor_costs shijidadao_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
shijidadao_memcpy,
shijidadao_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -4078,6 +4258,12 @@ struct processor_costs generic_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
generic_memcpy,
generic_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -4215,6 +4401,12 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
core_memcpy,
core_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
diff --git a/gcc/config/loongarch/genopts/isa-evolution.in b/gcc/config/loongarch/genopts/isa-evolution.in
index 50f72d5..836d93a 100644
--- a/gcc/config/loongarch/genopts/isa-evolution.in
+++ b/gcc/config/loongarch/genopts/isa-evolution.in
@@ -2,4 +2,5 @@
2 26 div32 1.1 Support div.w[u] and mod.w[u] instructions with inputs not sign-extended.
2 27 lam-bh 1.1 Support am{swap/add}[_db].{b/h} instructions.
2 28 lamcas 1.1 Support amcas[_db].{b/h/w/d} instructions.
+2 30 scq 1.1 Support sc.q instruction.
3 23 ld-seq-sa 1.1 Do not need load-load barriers (dbar 0x700).
diff --git a/gcc/config/loongarch/loongarch-def.cc b/gcc/config/loongarch/loongarch-def.cc
index 04b277e..dcd8d90 100644
--- a/gcc/config/loongarch/loongarch-def.cc
+++ b/gcc/config/loongarch/loongarch-def.cc
@@ -72,7 +72,7 @@ array_arch<loongarch_isa> loongarch_cpu_default_isa =
.simd_ (ISA_EXT_SIMD_LASX)
.evolution_ (OPTION_MASK_ISA_DIV32 | OPTION_MASK_ISA_LD_SEQ_SA
| OPTION_MASK_ISA_LAM_BH | OPTION_MASK_ISA_LAMCAS
- | OPTION_MASK_ISA_FRECIPE))
+ | OPTION_MASK_ISA_FRECIPE | OPTION_MASK_ISA_SCQ))
.set (ARCH_LA64V1_0,
loongarch_isa ()
.base_ (ISA_BASE_LA64)
@@ -86,7 +86,7 @@ array_arch<loongarch_isa> loongarch_cpu_default_isa =
.simd_ (ISA_EXT_SIMD_LSX)
.evolution_ (OPTION_MASK_ISA_DIV32 | OPTION_MASK_ISA_LD_SEQ_SA
| OPTION_MASK_ISA_LAM_BH | OPTION_MASK_ISA_LAMCAS
- | OPTION_MASK_ISA_FRECIPE));
+ | OPTION_MASK_ISA_FRECIPE | OPTION_MASK_ISA_SCQ));
static inline loongarch_cache la464_cache ()
diff --git a/gcc/config/loongarch/loongarch-def.h b/gcc/config/loongarch/loongarch-def.h
index 0bcd2a7..0a7d0c9 100644
--- a/gcc/config/loongarch/loongarch-def.h
+++ b/gcc/config/loongarch/loongarch-def.h
@@ -78,12 +78,10 @@ extern loongarch_def_array<const char *, N_ISA_EXT_TYPES>
/* Base ABI */
-enum {
- ABI_BASE_LP64D = 0,
- ABI_BASE_LP64F = 1,
- ABI_BASE_LP64S = 2,
- N_ABI_BASE_TYPES = 3
-};
+#define ABI_BASE_LP64D 0
+#define ABI_BASE_LP64F 1
+#define ABI_BASE_LP64S 2
+#define N_ABI_BASE_TYPES 3
extern loongarch_def_array<const char *, N_ABI_BASE_TYPES>
loongarch_abi_base_strings;
diff --git a/gcc/config/loongarch/loongarch-evolution.cc b/gcc/config/loongarch/loongarch-evolution.cc
index de68624..a92a645 100644
--- a/gcc/config/loongarch/loongarch-evolution.cc
+++ b/gcc/config/loongarch/loongarch-evolution.cc
@@ -32,6 +32,7 @@ int la_evo_feature_masks[] = {
OPTION_MASK_ISA_DIV32,
OPTION_MASK_ISA_LAM_BH,
OPTION_MASK_ISA_LAMCAS,
+ OPTION_MASK_ISA_SCQ,
OPTION_MASK_ISA_LD_SEQ_SA,
};
@@ -40,6 +41,7 @@ const char* la_evo_macro_name[] = {
"__loongarch_div32",
"__loongarch_lam_bh",
"__loongarch_lamcas",
+ "__loongarch_scq",
"__loongarch_ld_seq_sa",
};
@@ -48,6 +50,7 @@ int la_evo_version_major[] = {
1, /* DIV32 */
1, /* LAM_BH */
1, /* LAMCAS */
+ 1, /* SCQ */
1, /* LD_SEQ_SA */
};
@@ -56,5 +59,6 @@ int la_evo_version_minor[] = {
1, /* DIV32 */
1, /* LAM_BH */
1, /* LAMCAS */
+ 1, /* SCQ */
1, /* LD_SEQ_SA */
};
diff --git a/gcc/config/loongarch/loongarch-evolution.h b/gcc/config/loongarch/loongarch-evolution.h
index 5f90839..7fb7b0d 100644
--- a/gcc/config/loongarch/loongarch-evolution.h
+++ b/gcc/config/loongarch/loongarch-evolution.h
@@ -36,6 +36,7 @@ static constexpr struct {
{ 2, 1u << 26, OPTION_MASK_ISA_DIV32 },
{ 2, 1u << 27, OPTION_MASK_ISA_LAM_BH },
{ 2, 1u << 28, OPTION_MASK_ISA_LAMCAS },
+ { 2, 1u << 30, OPTION_MASK_ISA_SCQ },
{ 3, 1u << 23, OPTION_MASK_ISA_LD_SEQ_SA },
};
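The new table row places the feature in CPUCFG word 2, bit 30, matching the isa-evolution.in entry. A run-time probe would look roughly like this on LoongArch (the inline-asm form is illustrative only; the authoritative mapping is the table row itself, and the function name is mine):

/* Sketch: test CPUCFG word 2, bit 30 (sc.q) at run time.  LoongArch only.  */
static int
cpu_has_scq (void)
{
  unsigned int word2;
  __asm__ ("cpucfg %0,%1" : "=r" (word2) : "r" (2));
  return (word2 >> 30) & 1;
}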
@@ -58,8 +59,9 @@ enum {
EVO_DIV32 = 1,
EVO_LAM_BH = 2,
EVO_LAMCAS = 3,
- EVO_LD_SEQ_SA = 4,
- N_EVO_FEATURES = 5
+ EVO_SCQ = 4,
+ EVO_LD_SEQ_SA = 5,
+ N_EVO_FEATURES = 6
};
/* Condition macros */
@@ -71,6 +73,8 @@ enum {
(la_target.isa.evolution & OPTION_MASK_ISA_LAM_BH)
#define ISA_HAS_LAMCAS \
(la_target.isa.evolution & OPTION_MASK_ISA_LAMCAS)
+#define ISA_HAS_SCQ \
+ (la_target.isa.evolution & OPTION_MASK_ISA_SCQ)
#define ISA_HAS_LD_SEQ_SA \
(la_target.isa.evolution & OPTION_MASK_ISA_LD_SEQ_SA)
diff --git a/gcc/config/loongarch/loongarch-str.h b/gcc/config/loongarch/loongarch-str.h
index 1546ea3..583cce8 100644
--- a/gcc/config/loongarch/loongarch-str.h
+++ b/gcc/config/loongarch/loongarch-str.h
@@ -70,6 +70,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTSTR_DIV32 "div32"
#define OPTSTR_LAM_BH "lam-bh"
#define OPTSTR_LAMCAS "lamcas"
+#define OPTSTR_SCQ "scq"
#define OPTSTR_LD_SEQ_SA "ld-seq-sa"
#endif /* LOONGARCH_STR_H */
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 493f95e..0935d7b 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -4388,6 +4388,7 @@ loongarch_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
break;
}
else if (TARGET_RECIP_VEC_DIV
+ && vectype
&& gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
{
machine_mode mode = TYPE_MODE (vectype);
@@ -6221,9 +6222,8 @@ loongarch_print_operand_reloc (FILE *file, rtx op, bool hi64_part,
'Q' Print R_LARCH_RELAX for TLS IE.
'r' Print address 12-31bit relocation associated with OP.
'R' Print address 32-51bit relocation associated with OP.
- 'T' Print 'f' for (eq:CC ...), 't' for (ne:CC ...),
- 'z' for (eq:?I ...), 'n' for (ne:?I ...).
- 't' Like 'T', but with the EQ/NE cases reversed
+ 'T' Print a comment marker if %G outputs nothing.
+ 't' Print the register containing the higher 64 bits of a TImode.
'u' Print a LASX register.
'v' Print the insn size suffix b, h, w or d for vector modes V16QI, V8HI,
V4SI, V2SI, and w, d for vector modes V4SF, V2DF respectively.
@@ -6306,6 +6306,13 @@ loongarch_print_operand (FILE *file, rtx op, int letter)
fputs ("dbar\t0x700", file);
break;
+ case 'T':
+ if (!loongarch_cas_failure_memorder_needs_acquire (
+ memmodel_from_int (INTVAL (op)))
+ && ISA_HAS_LD_SEQ_SA)
+ fprintf (file, "%s", ASM_COMMENT_START);
+ break;
+
case 'h':
if (code == HIGH)
op = XEXP (op, 0);
@@ -6384,14 +6391,6 @@ loongarch_print_operand (FILE *file, rtx op, int letter)
false /* lo_reloc */);
break;
- case 't':
- case 'T':
- {
- int truth = (code == NE) == (letter == 'T');
- fputc ("zfnt"[truth * 2 + FCC_REG_P (REGNO (XEXP (op, 0)))], file);
- }
- break;
-
case 'V':
if (CONST_VECTOR_P (op))
{
@@ -6495,6 +6494,16 @@ loongarch_print_operand (FILE *file, rtx op, int letter)
}
break;
+ case 't':
+ if (GET_MODE (op) != TImode
+ || (op != CONST0_RTX (TImode) && code != REG))
+ {
+ output_operand_lossage ("invalid use of '%%%c'", letter);
+ break;
+ }
+ op = loongarch_subword (op, 1);
+ letter = 'z';
+ /* fall through */
default:
switch (code)
{
@@ -10786,9 +10795,9 @@ loongarch_expand_vec_cmp (rtx operands[])
to a fixed type. */
static machine_mode
-loongarch_promote_function_mode (const_tree type ATTRIBUTE_UNUSED,
+loongarch_promote_function_mode (const_tree type,
machine_mode mode,
- int *punsignedp ATTRIBUTE_UNUSED,
+ int *punsignedp,
const_tree fntype ATTRIBUTE_UNUSED,
int for_return ATTRIBUTE_UNUSED)
{
@@ -11154,6 +11163,46 @@ loongarch_c_mode_for_suffix (char suffix)
return VOIDmode;
}
+/* Implement TARGET_C_BITINT_TYPE_INFO.
+ Return true if _BitInt(N) is supported and fill its details into *INFO. */
+bool
+loongarch_bitint_type_info (int n, struct bitint_info *info)
+{
+ if (n <= 8)
+ info->limb_mode = QImode;
+ else if (n <= 16)
+ info->limb_mode = HImode;
+ else if (n <= 32)
+ info->limb_mode = SImode;
+ else if (n <= 64)
+ info->limb_mode = DImode;
+ else if (n <= 128)
+ info->limb_mode = TImode;
+ else
+ info->limb_mode = DImode;
+
+ info->abi_limb_mode = info->limb_mode;
+
+ if (n > 64)
+ info->abi_limb_mode = TImode;
+
+ info->big_endian = false;
+ info->extended = true;
+ return true;
+}
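In effect: up to 64 bits picks the smallest fitting limb, 65-128 bits is a single TImode limb, and anything wider is stored as DImode limbs but passed with a TImode ABI limb. A usage sketch (requires a compiler with C23 _BitInt support targeting loongarch64):

/* Limb choices implied by loongarch_bitint_type_info above.  */
_BitInt(7)    a;   /* QImode limb */
_BitInt(20)   b;   /* SImode limb */
_BitInt(100)  c;   /* one TImode limb */
_BitInt(300)  d;   /* DImode limbs in memory, TImode ABI limb */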
+
+/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
+
+static int
+loongarch_compute_pressure_classes (reg_class *classes)
+{
+ int i = 0;
+ classes[i++] = GENERAL_REGS;
+ classes[i++] = FP_REGS;
+ classes[i++] = FCC_REGS;
+ return i;
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -11428,6 +11477,12 @@ loongarch_c_mode_for_suffix (char suffix)
#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX loongarch_c_mode_for_suffix
+#undef TARGET_C_BITINT_TYPE_INFO
+#define TARGET_C_BITINT_TYPE_INFO loongarch_bitint_type_info
+
+#undef TARGET_COMPUTE_PRESSURE_CLASSES
+#define TARGET_COMPUTE_PRESSURE_CLASSES loongarch_compute_pressure_classes
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-loongarch.h"
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index d897763..e8819bf 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -270,7 +270,9 @@ along with GCC; see the file COPYING3. If not see
if (GET_MODE_CLASS (MODE) == MODE_INT \
&& GET_MODE_SIZE (MODE) < UNITS_PER_WORD) \
{ \
- if ((MODE) == SImode) \
+ if ((MODE) == SImode \
+ && !(TYPE && TREE_CODE (TYPE) == BITINT_TYPE \
+ && TYPE_PRECISION (TYPE) < 32)) \
(UNSIGNEDP) = 0; \
(MODE) = Pmode; \
}
@@ -823,8 +825,6 @@ typedef struct {
#define CASE_VECTOR_MODE Pmode
-#define CASE_VECTOR_SHORTEN_MODE(MIN, MAX, BODY) Pmode
-
/* Define this as 1 if `char' should by default be signed; else as 0. */
#ifndef DEFAULT_SIGNED_CHAR
#define DEFAULT_SIGNED_CHAR 1
diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt
index 4d85cf5..fbe61c0 100644
--- a/gcc/config/loongarch/loongarch.opt
+++ b/gcc/config/loongarch/loongarch.opt
@@ -334,6 +334,10 @@ mlamcas
Target Mask(ISA_LAMCAS) Var(la_isa_evolution)
Support amcas[_db].{b/h/w/d} instructions.
+mscq
+Target Mask(ISA_SCQ) Var(la_isa_evolution)
+Support sc.q instruction.
+
mld-seq-sa
Target Mask(ISA_LD_SEQ_SA) Var(la_isa_evolution)
Do not need load-load barriers (dbar 0x700).
diff --git a/gcc/config/loongarch/loongarch.opt.urls b/gcc/config/loongarch/loongarch.opt.urls
index 5f644f6..606a211 100644
--- a/gcc/config/loongarch/loongarch.opt.urls
+++ b/gcc/config/loongarch/loongarch.opt.urls
@@ -90,6 +90,9 @@ UrlSuffix(gcc/LoongArch-Options.html#index-mlam-bh)
mlamcas
UrlSuffix(gcc/LoongArch-Options.html#index-mlamcas)
+mscq
+UrlSuffix(gcc/LoongArch-Options.html#index-mscq)
+
mld-seq-sa
UrlSuffix(gcc/LoongArch-Options.html#index-mld-seq-sa)
diff --git a/gcc/config/loongarch/simd.md b/gcc/config/loongarch/simd.md
index dd17cd1..4156b26 100644
--- a/gcc/config/loongarch/simd.md
+++ b/gcc/config/loongarch/simd.md
@@ -773,7 +773,7 @@
(vec_select:<VEC_HALF>
(match_operand:IVEC 2 "register_operand" "f")
(match_operand:IVEC 4 "vect_par_cnst_even_or_odd_half")))
- (any_extend:<WVEC>
+ (any_extend:<WVEC_HALF>
(vec_select:<VEC_HALF>
(match_operand:IVEC 3 "register_operand" "f")
(match_dup 4))))
diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md
index fd8d732..2ee400e 100644
--- a/gcc/config/loongarch/sync.md
+++ b/gcc/config/loongarch/sync.md
@@ -21,25 +21,25 @@
(define_c_enum "unspec" [
UNSPEC_COMPARE_AND_SWAP
+ UNSPEC_COMPARE_AND_SWAP_AMCAS
UNSPEC_COMPARE_AND_SWAP_ADD
UNSPEC_COMPARE_AND_SWAP_SUB
- UNSPEC_COMPARE_AND_SWAP_AND
- UNSPEC_COMPARE_AND_SWAP_XOR
- UNSPEC_COMPARE_AND_SWAP_OR
UNSPEC_COMPARE_AND_SWAP_NAND
UNSPEC_SYNC_OLD_OP
UNSPEC_SYNC_EXCHANGE
UNSPEC_ATOMIC_STORE
UNSPEC_ATOMIC_LOAD
UNSPEC_MEMORY_BARRIER
+
+ UNSPEC_TI_FETCH_ADD
+ UNSPEC_TI_FETCH_SUB
+ UNSPEC_TI_FETCH_AND
+ UNSPEC_TI_FETCH_XOR
+ UNSPEC_TI_FETCH_OR
+ UNSPEC_TI_FETCH_NAND_MASK_INVERTED
])
(define_code_iterator any_atomic [plus ior xor and])
-(define_code_attr atomic_optab
- [(plus "add") (ior "or") (xor "xor") (and "and")])
-
-;; This attribute gives the format suffix for atomic memory operations.
-(define_mode_attr amo [(QI "b") (HI "h") (SI "w") (DI "d")])
;; <amop> expands to the name of the atomic operand that implements a
;; particular code.
@@ -107,7 +107,7 @@
(define_insn "atomic_load<mode>"
[(set (match_operand:QHWD 0 "register_operand" "=r")
(unspec_volatile:QHWD
- [(match_operand:QHWD 1 "memory_operand" "+m")
+ [(match_operand:QHWD 1 "memory_operand" "m")
(match_operand:SI 2 "const_int_operand")] ;; model
UNSPEC_ATOMIC_LOAD))]
""
@@ -142,9 +142,50 @@
}
[(set (attr "length") (const_int 12))])
+(define_insn "atomic_loadti_lsx"
+ [(set (match_operand:V2DI 0 "register_operand" "=f")
+ (unspec_volatile:V2DI
+ [(match_operand:TI 1 "memory_operand" "m")
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ UNSPEC_ATOMIC_LOAD))]
+ "ISA_HAS_LSX && TARGET_64BIT"
+{
+ enum memmodel model = memmodel_base (INTVAL (operands[2]));
+
+ switch (model)
+ {
+ case MEMMODEL_SEQ_CST:
+ output_asm_insn ("dbar\t0x11", operands);
+ /* fall through */
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_RELAXED:
+ return "vld\t%w0,%1\\n\\t%G2";
+
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set (attr "length") (const_int 12))])
+
+(define_expand "atomic_loadti"
+ [(match_operand:TI 0 "register_operand" "=r")
+ (match_operand:TI 1 "memory_operand" "m")
+ (match_operand:SI 2 "const_int_operand")]
+ "ISA_HAS_LSX && TARGET_64BIT"
+{
+ rtx vr = gen_reg_rtx (V2DImode);
+
+ emit_insn (gen_atomic_loadti_lsx (vr, operands[1], operands[2]));
+ for (int i = 0; i < 2; i++)
+ emit_insn (
+ gen_lsx_vpickve2gr_d (loongarch_subword (operands[0], i), vr,
+ GEN_INT (i)));
+ DONE;
+})
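With LSX, the 16-byte atomic load becomes a single vld (plus whatever barrier the memory model asks for), and the two halves are then moved to GPRs with vpickve2gr.d. A usage sketch of what this backs at the source level (whether it is inlined still depends on the exact target flags):

/* 16-byte atomic load; inlinable on loongarch64 with LSX after this change.  */
static unsigned __int128
load128_acquire (const unsigned __int128 *p)
{
  return __atomic_load_n (p, __ATOMIC_ACQUIRE);
}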
+
;; Implement atomic stores with amoswap. Fall back to fences for atomic loads.
(define_insn "atomic_store<mode>"
- [(set (match_operand:QHWD 0 "memory_operand" "+m")
+ [(set (match_operand:QHWD 0 "memory_operand" "=m")
(unspec_volatile:QHWD
[(match_operand:QHWD 1 "reg_or_0_operand" "rJ")
(match_operand:SI 2 "const_int_operand")] ;; model
@@ -175,7 +216,67 @@
}
[(set (attr "length") (const_int 12))])
-(define_insn "atomic_<atomic_optab><mode>"
+(define_insn "atomic_storeti_lsx"
+ [(set (match_operand:TI 0 "memory_operand" "=m")
+ (unspec_volatile:TI
+ [(match_operand:V2DI 1 "register_operand" "f")
+ (match_operand:SI 2 "const_int_operand")] ;; model
+ UNSPEC_ATOMIC_STORE))]
+ "ISA_HAS_LSX && TARGET_64BIT"
+{
+ enum memmodel model = memmodel_base (INTVAL (operands[2]));
+
+ switch (model)
+ {
+ case MEMMODEL_SEQ_CST:
+ return "dbar\t0x12\\n\\t"
+ "vst\t%w1,%0\\n\\t"
+ "dbar\t0x18";
+ case MEMMODEL_RELEASE:
+ return "dbar\t0x12\\n\\t"
+ "vst\t%w1,%0";
+ case MEMMODEL_RELAXED:
+ return "vst\t%w1,%0";
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set (attr "length") (const_int 12))])
+
+(define_insn "atomic_storeti_scq"
+ [(set (match_operand:TI 0 "memory_operand" "=m")
+ (unspec_volatile:TI
+ [(match_operand:TI 1 "register_operand" "r")]
+ UNSPEC_ATOMIC_STORE))
+ (clobber (match_scratch:DI 2 "=&r"))]
+ "TARGET_64BIT && ISA_HAS_SCQ"
+ "1:\\n\\tll.d\t$r0,%0\n\tmove\t%2,%1\n\tsc.q\t%2,%t1,%0\n\tbeqz\t%2,1b"
+ [(set (attr "length") (const_int 16))])
+
+(define_expand "atomic_storeti"
+ [(match_operand:TI 0 "memory_operand" "=m")
+ (match_operand:TI 1 "reg_or_0_operand" "rJ")
+ (match_operand:SI 2 "const_int_operand")]
+ "TARGET_64BIT && (ISA_HAS_LSX || ISA_HAS_SCQ)"
+{
+ if (!ISA_HAS_LSX)
+ {
+ emit_insn (gen_atomic_storeti_scq (operands[0], operands[1]));
+ DONE;
+ }
+
+ rtx vr = gen_reg_rtx (V2DImode), op1 = operands[1];
+ rtvec v = rtvec_alloc (2);
+
+ for (int i = 0; i < 2; i++)
+ RTVEC_ELT (v, i) = loongarch_subword (op1, i);
+
+ emit_insn (gen_vec_initv2didi (vr, gen_rtx_PARALLEL (V2DImode, v)));
+ emit_insn (gen_atomic_storeti_lsx (operands[0], vr, operands[2]));
+ DONE;
+})
+
+(define_insn "atomic_<amop><mode>"
[(set (match_operand:GPR 0 "memory_operand" "+ZB")
(unspec_volatile:GPR
[(any_atomic:GPR (match_dup 0)
@@ -183,7 +284,7 @@
(match_operand:SI 2 "const_int_operand")] ;; model
UNSPEC_SYNC_OLD_OP))]
""
- "am<amop>%A2.<amo>\t$zero,%z1,%0"
+ "am<amop>%A2.<size>\t$zero,%z1,%0"
[(set (attr "length") (const_int 4))])
(define_insn "atomic_add<mode>"
@@ -194,10 +295,10 @@
(match_operand:SI 2 "const_int_operand")] ;; model
UNSPEC_SYNC_OLD_OP))]
"ISA_HAS_LAM_BH"
- "amadd%A2.<amo>\t$zero,%z1,%0"
+ "amadd%A2.<size>\t$zero,%z1,%0"
[(set (attr "length") (const_int 4))])
-(define_insn "atomic_fetch_<atomic_optab><mode>"
+(define_insn "atomic_fetch_<amop><mode>"
[(set (match_operand:GPR 0 "register_operand" "=&r")
(match_operand:GPR 1 "memory_operand" "+ZB"))
(set (match_dup 1)
@@ -207,9 +308,52 @@
(match_operand:SI 3 "const_int_operand")] ;; model
UNSPEC_SYNC_OLD_OP))]
""
- "am<amop>%A3.<amo>\t%0,%z2,%1"
+ "am<amop>%A3.<size>\t%0,%z2,%1"
[(set (attr "length") (const_int 4))])
+(define_insn "atomic_fetch_nand_mask_inverted<mode>"
+ [(set (match_operand:GPR 0 "register_operand" "=&r")
+ (match_operand:GPR 1 "memory_operand" "+ZC"))
+ (set (match_dup 1)
+ (unspec_volatile:GPR
+ [(ior:GPR (not (match_dup 1))
+ (match_operand:GPR 2 "register_operand" "r"))]
+ UNSPEC_SYNC_OLD_OP))
+ (clobber (match_scratch:GPR 3 "=&r"))]
+ ""
+ {
+ return "1:\\n\\t"
+ "ll.<d>\\t%0,%1\\n\\t"
+ "orn\\t%3,%2,%0\\n\\t"
+ "sc.<d>\\t%3,%1\\n\\t"
+ "beqz\\t%3,1b";
+ }
+ [(set (attr "length") (const_int 16))])
+
+(define_mode_iterator ALL_SC [GPR (TI "TARGET_64BIT && ISA_HAS_SCQ")])
+(define_mode_attr _scq [(SI "") (DI "") (TI "_scq")])
+(define_expand "atomic_fetch_nand<mode>"
+ [(match_operand:ALL_SC 0 "register_operand")
+ (match_operand:ALL_SC 1 "memory_operand")
+ (match_operand:ALL_SC 2 "reg_or_0_operand")
+ (match_operand:SI 3 "const_int_operand")]
+ ""
+ {
+ /* ~(atom & mask) = (~mask) | (~atom), so we can hoist
+ (~mask) out of the ll/sc loop and use the orn instruction in the
+ ll/sc loop. */
+ rtx inverted_mask = gen_reg_rtx (<MODE>mode);
+ emit_move_insn (inverted_mask,
+ expand_simple_unop (<MODE>mode, NOT, operands[2],
+ NULL_RTX, false));
+
+ emit_insn (
+ gen_atomic_fetch_nand_mask_inverted<mode><_scq> (operands[0],
+ operands[1],
+ inverted_mask));
+ DONE;
+ })
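The expander leans on the identity quoted in the comment: ~(atom & mask) == (~mask) | (~atom), so the NOT of the mask is computed once outside the ll/sc loop and orn does the rest inside it. A throwaway C check of the identity over all byte values:

#include <assert.h>
#include <stdint.h>

/* Exhaustive sanity check of ~(a & m) == (~m) | (~a) on bytes.  */
int
main (void)
{
  for (unsigned a = 0; a < 256; a++)
    for (unsigned m = 0; m < 256; m++)
      assert ((uint8_t) ~(a & m) == (uint8_t) ((~m) | (~a)));
  return 0;
}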
+
(define_insn "atomic_exchange<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&r")
(unspec_volatile:GPR
@@ -219,9 +363,44 @@
(set (match_dup 1)
(match_operand:GPR 2 "register_operand" "r"))]
""
- "amswap%A3.<amo>\t%0,%z2,%1"
+ "amswap%A3.<size>\t%0,%z2,%1"
[(set (attr "length") (const_int 4))])
+(define_insn "atomic_exchangeti_scq"
+ [(set (match_operand:TI 0 "register_operand" "=&r")
+ (unspec_volatile:TI
+ [(match_operand:TI 1 "memory_operand" "+ZB")]
+ UNSPEC_SYNC_EXCHANGE))
+ (set (match_dup 1)
+ (match_operand:TI 2 "register_operand" "rJ"))
+ (clobber (match_scratch:DI 3 "=&r"))]
+ "TARGET_64BIT && ISA_HAS_SCQ"
+{
+ output_asm_insn ("1:", operands);
+ output_asm_insn ("ll.d\t%0,%1", operands);
+ if (!ISA_HAS_LD_SEQ_SA)
+ output_asm_insn ("dbar\t0x700", operands);
+ output_asm_insn ("ld.d\t%t0,%b1,8", operands);
+ output_asm_insn ("move\t%3,%z2", operands);
+ output_asm_insn ("sc.q\t%3,%t2,%1", operands);
+ output_asm_insn ("beqz\t%3,1b", operands);
+
+ return "";
+}
+ [(set (attr "length") (const_int 24))])
+
+(define_expand "atomic_exchangeti"
+ [(match_operand:TI 0 "register_operand" "=&r")
+ (match_operand:TI 1 "memory_operand" "+ZB")
+ (match_operand:TI 2 "register_operand" "rJ")
+ (match_operand:SI 3 "const_int_operand")] ;; model
+ "TARGET_64BIT && ISA_HAS_SCQ"
+{
+ emit_insn (gen_atomic_exchangeti_scq (operands[0], operands[1],
+ operands[2]));
+ DONE;
+})
+
(define_insn "atomic_exchange<mode>_short"
[(set (match_operand:SHORT 0 "register_operand" "=&r")
(unspec_volatile:SHORT
@@ -231,7 +410,7 @@
(set (match_dup 1)
(match_operand:SHORT 2 "register_operand" "r"))]
"ISA_HAS_LAM_BH"
- "amswap%A3.<amo>\t%0,%z2,%1"
+ "amswap%A3.<size>\t%0,%z2,%1"
[(set (attr "length") (const_int 4))])
(define_insn "atomic_cas_value_strong<mode>"
@@ -240,13 +419,13 @@
(set (match_dup 1)
(unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ")
(match_operand:GPR 3 "reg_or_0_operand" "rJ")
- (match_operand:SI 4 "const_int_operand")] ;; mod_s
+ (match_operand:SI 4 "const_int_operand")] ;; mod_f
UNSPEC_COMPARE_AND_SWAP))
(clobber (match_scratch:GPR 5 "=&r"))]
""
{
output_asm_insn ("1:", operands);
- output_asm_insn ("ll.<amo>\t%0,%1", operands);
+ output_asm_insn ("ll.<size>\t%0,%1", operands);
/* Like the test case atomic-cas-int.C, in loongarch64, O1 and higher, the
return value of the val_without_const_folding will not be truncated and
@@ -266,9 +445,9 @@
output_asm_insn ("bne\t%0,%z2,2f", operands);
output_asm_insn ("or%i3\t%5,$zero,%3", operands);
- output_asm_insn ("sc.<amo>\t%5,%1", operands);
+ output_asm_insn ("sc.<size>\t%5,%1", operands);
output_asm_insn ("beqz\t%5,1b", operands);
- output_asm_insn ("b\t3f", operands);
+ output_asm_insn ("%T4b\t3f", operands);
output_asm_insn ("2:", operands);
output_asm_insn ("%G4", operands);
output_asm_insn ("3:", operands);
@@ -288,10 +467,10 @@
(set (match_dup 1)
(unspec_volatile:QHWD [(match_operand:QHWD 2 "reg_or_0_operand" "rJ")
(match_operand:QHWD 3 "reg_or_0_operand" "rJ")
- (match_operand:SI 4 "const_int_operand")] ;; mod_s
- UNSPEC_COMPARE_AND_SWAP))]
+ (match_operand:SI 4 "const_int_operand")] ;; mod
+ UNSPEC_COMPARE_AND_SWAP_AMCAS))]
"ISA_HAS_LAMCAS"
- "ori\t%0,%z2,0\n\tamcas%A4.<amo>\t%0,%z3,%1"
+ "ori\t%0,%z2,0\n\tamcas%A4.<size>\t%0,%z3,%1"
[(set (attr "length") (const_int 8))])
(define_expand "atomic_compare_and_swap<mode>"
@@ -318,16 +497,14 @@
&& is_mm_release (memmodel_base (INTVAL (mod_s))))
mod_s = GEN_INT (MEMMODEL_ACQ_REL);
- operands[6] = mod_s;
-
if (ISA_HAS_LAMCAS)
emit_insn (gen_atomic_cas_value_strong<mode>_amcas (operands[1], operands[2],
operands[3], operands[4],
- operands[6]));
+ mod_s));
else
emit_insn (gen_atomic_cas_value_strong<mode> (operands[1], operands[2],
operands[3], operands[4],
- operands[6]));
+ mod_f));
rtx compare = operands[1];
if (operands[3] != const0_rtx)
@@ -349,49 +526,74 @@
DONE;
})
-(define_expand "atomic_test_and_set"
- [(match_operand:QI 0 "register_operand" "") ;; bool output
- (match_operand:QI 1 "memory_operand" "+ZB") ;; memory
- (match_operand:SI 2 "const_int_operand" "")] ;; model
+(define_expand "atomic_fetch_<amop><mode>"
+ [(match_operand:SHORT 0 "register_operand" "") ;; output
+ (any_bitwise (match_operand:SHORT 1 "memory_operand" "+ZB") ;; memory
+ (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) ;; val
+ (match_operand:SI 3 "const_int_operand" "")] ;; model
""
{
- /* We have no QImode atomics, so use the address LSBs to form a mask,
- then use an aligned SImode atomic. */
+ /* We have no QI/HImode bitwise atomics, so use the address LSBs to form
+ a mask, then use an aligned SImode atomic. */
rtx result = operands[0];
rtx mem = operands[1];
- rtx model = operands[2];
+ rtx model = operands[3];
rtx addr = force_reg (Pmode, XEXP (mem, 0));
- rtx tmp_reg = gen_reg_rtx (Pmode);
- rtx zero_reg = gen_rtx_REG (Pmode, 0);
-
+ rtx mask = gen_int_mode (-4, Pmode);
rtx aligned_addr = gen_reg_rtx (Pmode);
- emit_move_insn (tmp_reg, gen_rtx_PLUS (Pmode, zero_reg, GEN_INT (-4)));
- emit_move_insn (aligned_addr, gen_rtx_AND (Pmode, addr, tmp_reg));
+
+ if (!and_operand (mask, Pmode))
+ mask = force_reg (Pmode, mask);
+
+ emit_move_insn (aligned_addr, gen_rtx_AND (Pmode, addr, mask));
rtx aligned_mem = change_address (mem, SImode, aligned_addr);
set_mem_alias_set (aligned_mem, 0);
- rtx offset = gen_reg_rtx (SImode);
- emit_move_insn (offset, gen_rtx_AND (SImode, gen_lowpart (SImode, addr),
- GEN_INT (3)));
-
rtx tmp = gen_reg_rtx (SImode);
- emit_move_insn (tmp, GEN_INT (1));
+ emit_move_insn (tmp, simplify_gen_unary (ZERO_EXTEND, SImode,
+ operands[2], <MODE>mode));
+ /* Note that we have defined SHIFT_COUNT_TRUNCATED to 1, so we don't need
+ to mask addr with 0b11 here. */
rtx shmt = gen_reg_rtx (SImode);
- emit_move_insn (shmt, gen_rtx_ASHIFT (SImode, offset, GEN_INT (3)));
+ emit_move_insn (shmt, gen_rtx_ASHIFT (SImode, gen_lowpart (SImode, addr),
+ GEN_INT (3)));
rtx word = gen_reg_rtx (SImode);
emit_move_insn (word, gen_rtx_ASHIFT (SImode, tmp, shmt));
+ if (<is_and>)
+ {
+ /* word = word | ~(mode_mask << shmt) */
+ rtx tmp = force_reg (SImode,
+ gen_int_mode (GET_MODE_MASK (<MODE>mode),
+ SImode));
+ emit_move_insn (tmp, gen_rtx_ASHIFT (SImode, tmp, shmt));
+ emit_move_insn (word, gen_rtx_IOR (SImode, gen_rtx_NOT (SImode, tmp),
+ word));
+ }
+
tmp = gen_reg_rtx (SImode);
- emit_insn (gen_atomic_fetch_orsi (tmp, aligned_mem, word, model));
+ emit_insn (gen_atomic_fetch_<amop>si (tmp, aligned_mem, word, model));
emit_move_insn (gen_lowpart (SImode, result),
gen_rtx_LSHIFTRT (SImode, tmp, shmt));
DONE;
})
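The expander lowers a byte or halfword bitwise AMO onto the aligned SImode word that contains it: the shifted operand only disturbs its own lane for OR and XOR, and for AND the bits outside the lane are first forced to 1 (the <is_and> branch above). A C sketch of the same trick for a byte-wide fetch_or, using GCC atomic built-ins on a little-endian target (the function name is mine, and strict-aliasing concerns are glossed over, since this only mirrors what happens at the RTL level):

#include <stdint.h>

/* Byte-wide atomic fetch_or by way of the containing aligned 32-bit word.  */
static uint8_t
fetch_or_u8 (uint8_t *p, uint8_t val)
{
  uintptr_t addr = (uintptr_t) p;
  uint32_t *word = (uint32_t *) (addr & ~(uintptr_t) 3);  /* aligned word */
  unsigned shift = (unsigned) (addr & 3) * 8;             /* lane offset  */
  uint32_t old = __atomic_fetch_or (word, (uint32_t) val << shift,
                                    __ATOMIC_SEQ_CST);
  return (uint8_t) (old >> shift);
}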
+(define_expand "atomic_test_and_set"
+ [(match_operand:QI 0 "register_operand" "") ;; bool output
+ (match_operand:QI 1 "memory_operand" "+ZB") ;; memory
+ (match_operand:SI 2 "const_int_operand" "")] ;; model
+ ""
+{
+ rtx one = force_reg (QImode, gen_int_mode (1, QImode));
+ emit_insn (gen_atomic_fetch_orqi (operands[0], operands[1], one,
+ operands[2]));
+ DONE;
+})
+
(define_insn "atomic_cas_value_cmp_and_7_<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&r")
(match_operand:GPR 1 "memory_operand" "+ZC"))
@@ -400,20 +602,20 @@
(match_operand:GPR 3 "reg_or_0_operand" "rJ")
(match_operand:GPR 4 "reg_or_0_operand" "rJ")
(match_operand:GPR 5 "reg_or_0_operand" "rJ")
- (match_operand:SI 6 "const_int_operand")] ;; model
+ (match_operand:SI 6 "const_int_operand")] ;; mod_f
UNSPEC_COMPARE_AND_SWAP))
(clobber (match_scratch:GPR 7 "=&r"))]
""
{
return "1:\\n\\t"
- "ll.<amo>\\t%0,%1\\n\\t"
+ "ll.<size>\\t%0,%1\\n\\t"
"and\\t%7,%0,%2\\n\\t"
"bne\\t%7,%z4,2f\\n\\t"
"and\\t%7,%0,%z3\\n\\t"
"or%i5\\t%7,%7,%5\\n\\t"
- "sc.<amo>\\t%7,%1\\n\\t"
+ "sc.<size>\\t%7,%1\\n\\t"
"beq\\t$zero,%7,1b\\n\\t"
- "b\\t3f\\n\\t"
+ "%T6b\\t3f\\n\\t"
"2:\\n\\t"
"%G6\\n\\t"
"3:\\n\\t";
@@ -444,18 +646,16 @@
&& is_mm_release (memmodel_base (INTVAL (mod_s))))
mod_s = GEN_INT (MEMMODEL_ACQ_REL);
- operands[6] = mod_s;
-
if (ISA_HAS_LAMCAS)
emit_insn (gen_atomic_cas_value_strong<mode>_amcas (operands[1], operands[2],
operands[3], operands[4],
- operands[6]));
+ mod_s));
else
{
union loongarch_gen_fn_ptrs generator;
generator.fn_7 = gen_atomic_cas_value_cmp_and_7_si;
loongarch_expand_atomic_qihi (generator, operands[1], operands[2],
- operands[3], operands[4], operands[6]);
+ operands[3], operands[4], mod_f);
}
rtx compare = operands[1];
@@ -481,83 +681,96 @@
DONE;
})
-(define_insn "atomic_cas_value_add_7_<mode>"
- [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
- (match_operand:GPR 1 "memory_operand" "+ZC"))
+(define_insn "atomic_compare_and_swapti_scq"
+ [(set (match_operand:TI 0 "register_operand" "=&r")
+ (match_operand:TI 1 "memory_operand" "+ZB"))
(set (match_dup 1)
- (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask
- (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask
- (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val
- (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val
- (match_operand:SI 6 "const_int_operand")] ;; model
- UNSPEC_COMPARE_AND_SWAP_ADD))
- (clobber (match_scratch:GPR 7 "=&r"))
- (clobber (match_scratch:GPR 8 "=&r"))]
- ""
+ (unspec_volatile:TI [(match_operand:TI 2 "reg_or_0_operand" "rJ")
+ (match_operand:TI 3 "reg_or_0_operand" "rJ")
+ (match_operand:SI 4 "const_int_operand")] ;; mod_f
+ UNSPEC_COMPARE_AND_SWAP))
+ (clobber (match_scratch:DI 5 "=&r"))]
+ "TARGET_64BIT && ISA_HAS_SCQ"
{
- return "1:\\n\\t"
- "ll.<amo>\\t%0,%1\\n\\t"
- "and\\t%7,%0,%3\\n\\t"
- "add.w\\t%8,%0,%z5\\n\\t"
- "and\\t%8,%8,%z2\\n\\t"
- "or%i8\\t%7,%7,%8\\n\\t"
- "sc.<amo>\\t%7,%1\\n\\t"
- "beq\\t$zero,%7,1b";
-}
+ output_asm_insn ("1:", operands);
+ output_asm_insn ("ll.d\t%0,%1", operands);
- [(set (attr "length") (const_int 28))])
+ /* Compare the low word. */
+ output_asm_insn ("bne\t%0,%z2,2f", operands);
-(define_insn "atomic_cas_value_sub_7_<mode>"
- [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
- (match_operand:GPR 1 "memory_operand" "+ZC"))
- (set (match_dup 1)
- (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask
- (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask
- (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val
- (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val
- (match_operand:SI 6 "const_int_operand")] ;; model
- UNSPEC_COMPARE_AND_SWAP_SUB))
- (clobber (match_scratch:GPR 7 "=&r"))
- (clobber (match_scratch:GPR 8 "=&r"))]
- ""
-{
- return "1:\\n\\t"
- "ll.<amo>\\t%0,%1\\n\\t"
- "and\\t%7,%0,%3\\n\\t"
- "sub.w\\t%8,%0,%z5\\n\\t"
- "and\\t%8,%8,%z2\\n\\t"
- "or%i8\\t%7,%7,%8\\n\\t"
- "sc.<amo>\\t%7,%1\\n\\t"
- "beq\\t$zero,%7,1b";
+ /* Don't reorder the load of the high word before ll.d. As the TImode
+ value must be aligned in memory, the high and low words must be in the
+ same cacheline, so dbar 0x700 is enough. */
+ if (!ISA_HAS_LD_SEQ_SA)
+ output_asm_insn ("dbar\t0x700", operands);
+
+ /* Now load the high word. As the high and low words are in the same
+ cacheline, if another core clobbers the high word before the sc.q
+ instruction executes, the LL bit for the low word will be cleared.
+ Thus a normal load is sufficient. */
+ output_asm_insn ("ld.d\t%t0,%b1,8", operands);
+
+ /* Compare the high word. */
+ output_asm_insn ("bne\t%t0,%t2,2f", operands);
+
+ /* Copy the low word of the new value as it'll be clobbered by sc.q. */
+ output_asm_insn ("move\t%5,%z3", operands);
+
+ /* Store both words if LL bit is still set. */
+ output_asm_insn ("sc.q\t%5,%t3,%1", operands);
+
+ /* Check if sc.q has done the store. */
+ output_asm_insn ("beqz\t%5,1b", operands);
+
+ /* Jump over the mod_f barrier if sc.q has succeeded. */
+ output_asm_insn ("%T4b\t3f", operands);
+
+ /* The barrier for mod_f. */
+ output_asm_insn ("2:", operands);
+ output_asm_insn ("%G4", operands);
+
+ output_asm_insn ("3:", operands);
+ return "";
}
- [(set (attr "length") (const_int 28))])
+ [(set_attr "length" "40")])
-(define_insn "atomic_cas_value_and_7_<mode>"
- [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
- (match_operand:GPR 1 "memory_operand" "+ZC"))
- (set (match_dup 1)
- (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask
- (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask
- (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val
- (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val
- (match_operand:SI 6 "const_int_operand")] ;; model
- UNSPEC_COMPARE_AND_SWAP_AND))
- (clobber (match_scratch:GPR 7 "=&r"))
- (clobber (match_scratch:GPR 8 "=&r"))]
- ""
+(define_expand "atomic_compare_and_swapti"
+ [(match_operand:SI 0 "register_operand" "") ;; bool output
+ (match_operand:TI 1 "register_operand" "") ;; val output
+ (match_operand:TI 2 "memory_operand" "") ;; memory
+ (match_operand:TI 3 "reg_or_0_operand" "") ;; expected value
+ (match_operand:TI 4 "reg_or_0_operand" "") ;; desired value
+ (match_operand:SI 5 "const_int_operand" "") ;; is_weak
+ (match_operand:SI 6 "const_int_operand" "") ;; mod_s
+ (match_operand:SI 7 "const_int_operand" "")] ;; mod_f
+ "TARGET_64BIT && ISA_HAS_SCQ"
{
- return "1:\\n\\t"
- "ll.<amo>\\t%0,%1\\n\\t"
- "and\\t%7,%0,%3\\n\\t"
- "and\\t%8,%0,%z5\\n\\t"
- "and\\t%8,%8,%z2\\n\\t"
- "or%i8\\t%7,%7,%8\\n\\t"
- "sc.<amo>\\t%7,%1\\n\\t"
- "beq\\t$zero,%7,1b";
-}
- [(set (attr "length") (const_int 28))])
+ emit_insn (gen_atomic_compare_and_swapti_scq (operands[1], operands[2],
+ operands[3], operands[4],
+ operands[7]));
+
+ rtx t[2];
-(define_insn "atomic_cas_value_xor_7_<mode>"
+ for (int i = 0; i < 2; i++)
+ {
+ rtx compare = loongarch_subword (operands[1], i);
+ rtx expect = loongarch_subword (operands[3], i);
+
+ t[i] = gen_reg_rtx (DImode);
+
+ if (expect != const0_rtx)
+ emit_insn (gen_xordi3 (t[i], compare, expect));
+ else
+ emit_move_insn (t[i], compare);
+ }
+
+ emit_insn (gen_iordi3 (t[0], t[0], t[1]));
+ emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_EQ (SImode, t[0], const0_rtx)));
+ DONE;
+})
+
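For reference, a C sketch of the operation this TImode expander serves, and of how it derives the boolean output from the value returned by the sc.q loop (XOR each half against the expected half, OR the two, test for zero). The helper names are illustrative:

    #include <stdbool.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    static inline bool
    cas16 (u128 *mem, u128 *expected, u128 desired)
    {
      /* With ISA_HAS_SCQ this maps onto the ll.d/ld.d/sc.q loop emitted
         by atomic_compare_and_swapti_scq.  */
      return __atomic_compare_exchange_n (mem, expected, desired, false,
                                          __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }

    /* The success bit as the expander computes it from the loaded value.  */
    static inline bool
    cas_succeeded (u128 loaded, u128 expected)
    {
      uint64_t lo = (uint64_t) loaded ^ (uint64_t) expected;
      uint64_t hi = (uint64_t) (loaded >> 64) ^ (uint64_t) (expected >> 64);
      return (lo | hi) == 0;
    }
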
+(define_insn "atomic_cas_value_add_7_<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
(match_operand:GPR 1 "memory_operand" "+ZC"))
(set (match_dup 1)
@@ -566,24 +779,24 @@
(match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val
(match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val
(match_operand:SI 6 "const_int_operand")] ;; model
- UNSPEC_COMPARE_AND_SWAP_XOR))
+ UNSPEC_COMPARE_AND_SWAP_ADD))
(clobber (match_scratch:GPR 7 "=&r"))
(clobber (match_scratch:GPR 8 "=&r"))]
""
{
return "1:\\n\\t"
- "ll.<amo>\\t%0,%1\\n\\t"
+ "ll.<size>\\t%0,%1\\n\\t"
"and\\t%7,%0,%3\\n\\t"
- "xor\\t%8,%0,%z5\\n\\t"
+ "add.w\\t%8,%0,%z5\\n\\t"
"and\\t%8,%8,%z2\\n\\t"
"or%i8\\t%7,%7,%8\\n\\t"
- "sc.<amo>\\t%7,%1\\n\\t"
+ "sc.<size>\\t%7,%1\\n\\t"
"beq\\t$zero,%7,1b";
}
[(set (attr "length") (const_int 28))])
-(define_insn "atomic_cas_value_or_7_<mode>"
+(define_insn "atomic_cas_value_sub_7_<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&r") ;; res
(match_operand:GPR 1 "memory_operand" "+ZC"))
(set (match_dup 1)
@@ -592,21 +805,20 @@
(match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val
(match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val
(match_operand:SI 6 "const_int_operand")] ;; model
- UNSPEC_COMPARE_AND_SWAP_OR))
+ UNSPEC_COMPARE_AND_SWAP_SUB))
(clobber (match_scratch:GPR 7 "=&r"))
(clobber (match_scratch:GPR 8 "=&r"))]
""
{
return "1:\\n\\t"
- "ll.<amo>\\t%0,%1\\n\\t"
+ "ll.<size>\\t%0,%1\\n\\t"
"and\\t%7,%0,%3\\n\\t"
- "or\\t%8,%0,%z5\\n\\t"
+ "sub.w\\t%8,%0,%z5\\n\\t"
"and\\t%8,%8,%z2\\n\\t"
"or%i8\\t%7,%7,%8\\n\\t"
- "sc.<amo>\\t%7,%1\\n\\t"
+ "sc.<size>\\t%7,%1\\n\\t"
"beq\\t$zero,%7,1b";
}
-
[(set (attr "length") (const_int 28))])
(define_insn "atomic_cas_value_nand_7_<mode>"
@@ -624,12 +836,12 @@
""
{
return "1:\\n\\t"
- "ll.<amo>\\t%0,%1\\n\\t"
+ "ll.<size>\\t%0,%1\\n\\t"
"and\\t%7,%0,%3\\n\\t"
"and\\t%8,%0,%z5\\n\\t"
"xor\\t%8,%8,%z2\\n\\t"
"or%i8\\t%7,%7,%8\\n\\t"
- "sc.<amo>\\t%7,%1\\n\\t"
+ "sc.<size>\\t%7,%1\\n\\t"
"beq\\t$zero,%7,1b";
}
[(set (attr "length") (const_int 28))])
@@ -648,10 +860,10 @@
""
{
return "1:\\n\\t"
- "ll.<amo>\\t%0,%1\\n\\t"
+ "ll.<size>\\t%0,%1\\n\\t"
"and\\t%7,%0,%z3\\n\\t"
"or%i5\\t%7,%7,%5\\n\\t"
- "sc.<amo>\\t%7,%1\\n\\t"
+ "sc.<size>\\t%7,%1\\n\\t"
"beqz\\t%7,1b\\n\\t";
}
[(set (attr "length") (const_int 20))])
@@ -678,6 +890,101 @@
DONE;
})
+(define_int_iterator UNSPEC_TI_FETCH_DIRECT
+ [UNSPEC_TI_FETCH_ADD
+ UNSPEC_TI_FETCH_SUB
+ UNSPEC_TI_FETCH_AND
+ UNSPEC_TI_FETCH_XOR
+ UNSPEC_TI_FETCH_OR])
+(define_int_iterator UNSPEC_TI_FETCH
+ [UNSPEC_TI_FETCH_DIRECT UNSPEC_TI_FETCH_NAND_MASK_INVERTED])
+(define_int_attr amop_ti_fetch
+ [(UNSPEC_TI_FETCH_ADD "add")
+ (UNSPEC_TI_FETCH_SUB "sub")
+ (UNSPEC_TI_FETCH_AND "and")
+ (UNSPEC_TI_FETCH_XOR "xor")
+ (UNSPEC_TI_FETCH_OR "or")
+ (UNSPEC_TI_FETCH_NAND_MASK_INVERTED "nand_mask_inverted")])
+(define_int_attr size_ti_fetch
+ [(UNSPEC_TI_FETCH_ADD "36")
+ (UNSPEC_TI_FETCH_SUB "36")
+ (UNSPEC_TI_FETCH_AND "28")
+ (UNSPEC_TI_FETCH_XOR "28")
+ (UNSPEC_TI_FETCH_OR "28")
+ (UNSPEC_TI_FETCH_NAND_MASK_INVERTED "28")])
+
+(define_insn "atomic_fetch_<amop_ti_fetch>ti_scq"
+ [(set (match_operand:TI 0 "register_operand" "=&r")
+ (match_operand:TI 1 "memory_operand" "+ZB"))
+ (set (match_dup 1)
+ (unspec_volatile:TI
+ [(match_dup 0)
+ (match_operand:TI 2 "reg_or_0_operand" "rJ")]
+ UNSPEC_TI_FETCH))
+ (clobber (match_scratch:DI 3 "=&r"))
+ (clobber (match_scratch:DI 4 "=&r"))]
+ "TARGET_64BIT && ISA_HAS_SCQ"
+{
+ output_asm_insn ("1:", operands);
+ output_asm_insn ("ll.d\t%0,%1", operands);
+ if (!ISA_HAS_LD_SEQ_SA)
+ output_asm_insn ("dbar\t0x700", operands);
+ output_asm_insn ("ld.d\t%t0,%b1,8", operands);
+
+ switch (<UNSPEC_TI_FETCH>)
+ {
+ case UNSPEC_TI_FETCH_AND:
+ case UNSPEC_TI_FETCH_OR:
+ case UNSPEC_TI_FETCH_XOR:
+ output_asm_insn ("<amop_ti_fetch>\t%3,%0,%z2", operands);
+ output_asm_insn ("<amop_ti_fetch>\t%4,%t0,%t2", operands);
+ break;
+ case UNSPEC_TI_FETCH_NAND_MASK_INVERTED:
+ output_asm_insn ("orn\t%3,%z2,%0", operands);
+ output_asm_insn ("orn\t%4,%t2,%t0", operands);
+ break;
+ case UNSPEC_TI_FETCH_ADD:
+ case UNSPEC_TI_FETCH_SUB:
+ output_asm_insn ("<amop_ti_fetch>.d\t%3,%0,%z2", operands);
+
+ /* Generate the carry bit (a borrow for sub). */
+ output_asm_insn (
+ <UNSPEC_TI_FETCH> == UNSPEC_TI_FETCH_ADD ? "sltu\t%4,%3,%0"
+ : "sltu\t%4,%0,%3",
+ operands);
+
+ output_asm_insn ("<amop_ti_fetch>.d\t%4,%t0,%4", operands);
+ output_asm_insn ("<amop_ti_fetch>.d\t%4,%4,%t2", operands);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ output_asm_insn ("sc.q\t%3,%4,%1", operands);
+ output_asm_insn ("beqz\t%3,1b", operands);
+
+ return "";
+}
+ [(set_attr "length" "<size_ti_fetch>")])
+
+(define_expand "atomic_fetch_<amop_ti_fetch>ti"
+ [(set (match_operand:TI 0 "register_operand" "=&r")
+ (match_operand:TI 1 "memory_operand" "+ZB"))
+ (set (match_dup 1)
+ (unspec_volatile:TI
+ [(match_dup 0)
+ (match_operand:TI 2 "reg_or_0_operand" "rJ")]
+ UNSPEC_TI_FETCH_DIRECT))
+ (match_operand:SI 3 "const_int_operand")] ;; model
+ "TARGET_64BIT && ISA_HAS_SCQ"
+{
+ /* The memory model is ignored, as sc.q already implies a full barrier. */
+ emit_insn (gen_atomic_fetch_<amop_ti_fetch>ti_scq (operands[0],
+ operands[1],
+ operands[2]));
+ DONE;
+})
+
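The add/sub cases above compute the low double-word first, recover the carry (or borrow) with an unsigned compare, and then fold it into the high double-word. The same arithmetic in plain C, as a sketch only:

    #include <stdint.h>

    static inline void
    add128 (uint64_t *lo, uint64_t *hi, uint64_t addlo, uint64_t addhi)
    {
      uint64_t newlo = *lo + addlo;
      uint64_t carry = newlo < *lo;   /* sltu %4,%3,%0 in the pattern  */
      *hi += carry + addhi;
      *lo = newlo;
    }

    static inline void
    sub128 (uint64_t *lo, uint64_t *hi, uint64_t sublo, uint64_t subhi)
    {
      uint64_t newlo = *lo - sublo;
      uint64_t borrow = *lo < newlo;  /* sltu %4,%0,%3 in the pattern  */
      *hi -= borrow + subhi;
      *lo = newlo;
    }
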
(define_insn "atomic_fetch_add<mode>_short"
[(set (match_operand:SHORT 0 "register_operand" "=&r")
(match_operand:SHORT 1 "memory_operand" "+ZB"))
@@ -688,7 +995,7 @@
(match_operand:SI 3 "const_int_operand")] ;; model
UNSPEC_SYNC_OLD_OP))]
"ISA_HAS_LAM_BH"
- "amadd%A3.<amo>\t%0,%z2,%1"
+ "amadd%A3.<size>\t%0,%z2,%1"
[(set (attr "length") (const_int 4))])
(define_expand "atomic_fetch_add<mode>"
@@ -724,7 +1031,7 @@
(match_operand:SHORT 2 "reg_or_0_operand" "rJ"))
(match_operand:SI 3 "const_int_operand")] ;; model
UNSPEC_SYNC_OLD_OP))]
- ""
+ "!ISA_HAS_LAM_BH"
{
union loongarch_gen_fn_ptrs generator;
generator.fn_7 = gen_atomic_cas_value_sub_7_si;
@@ -733,60 +1040,6 @@
DONE;
})
-(define_expand "atomic_fetch_and<mode>"
- [(set (match_operand:SHORT 0 "register_operand" "=&r")
- (match_operand:SHORT 1 "memory_operand" "+ZB"))
- (set (match_dup 1)
- (unspec_volatile:SHORT
- [(and:SHORT (match_dup 1)
- (match_operand:SHORT 2 "reg_or_0_operand" "rJ"))
- (match_operand:SI 3 "const_int_operand")] ;; model
- UNSPEC_SYNC_OLD_OP))]
- ""
-{
- union loongarch_gen_fn_ptrs generator;
- generator.fn_7 = gen_atomic_cas_value_and_7_si;
- loongarch_expand_atomic_qihi (generator, operands[0], operands[1],
- operands[1], operands[2], operands[3]);
- DONE;
-})
-
-(define_expand "atomic_fetch_xor<mode>"
- [(set (match_operand:SHORT 0 "register_operand" "=&r")
- (match_operand:SHORT 1 "memory_operand" "+ZB"))
- (set (match_dup 1)
- (unspec_volatile:SHORT
- [(xor:SHORT (match_dup 1)
- (match_operand:SHORT 2 "reg_or_0_operand" "rJ"))
- (match_operand:SI 3 "const_int_operand")] ;; model
- UNSPEC_SYNC_OLD_OP))]
- ""
-{
- union loongarch_gen_fn_ptrs generator;
- generator.fn_7 = gen_atomic_cas_value_xor_7_si;
- loongarch_expand_atomic_qihi (generator, operands[0], operands[1],
- operands[1], operands[2], operands[3]);
- DONE;
-})
-
-(define_expand "atomic_fetch_or<mode>"
- [(set (match_operand:SHORT 0 "register_operand" "=&r")
- (match_operand:SHORT 1 "memory_operand" "+ZB"))
- (set (match_dup 1)
- (unspec_volatile:SHORT
- [(ior:SHORT (match_dup 1)
- (match_operand:SHORT 2 "reg_or_0_operand" "rJ"))
- (match_operand:SI 3 "const_int_operand")] ;; model
- UNSPEC_SYNC_OLD_OP))]
- ""
-{
- union loongarch_gen_fn_ptrs generator;
- generator.fn_7 = gen_atomic_cas_value_or_7_si;
- loongarch_expand_atomic_qihi (generator, operands[0], operands[1],
- operands[1], operands[2], operands[3]);
- DONE;
-})
-
(define_expand "atomic_fetch_nand<mode>"
[(set (match_operand:SHORT 0 "register_operand" "=&r")
(match_operand:SHORT 1 "memory_operand" "+ZB"))
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index e224ade..494f14c 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -2363,8 +2363,14 @@ enum reg_class
#define STACK_GROWS_DOWNWARD 1
-#define FRAME_GROWS_DOWNWARD (flag_stack_protect != 0 \
- || (flag_sanitize & SANITIZE_ADDRESS) != 0)
+/* Growing the frame downwards allows us to put spills closest to
+ the stack pointer, which is good as they are likely to be accessed
+ frequently. We can also arrange for normal stack usage to place
+ scalars last so that they too are close to the stack pointer. */
+#define FRAME_GROWS_DOWNWARD ((TARGET_MIPS16 \
+ && TARGET_FRAME_GROWS_DOWNWARDS) \
+ || (flag_stack_protect != 0 \
+ || (flag_sanitize & SANITIZE_ADDRESS) != 0))
/* Size of the area allocated in the frame to save the GP. */
diff --git a/gcc/config/mips/mips.opt b/gcc/config/mips/mips.opt
index e245654..f07db5a 100644
--- a/gcc/config/mips/mips.opt
+++ b/gcc/config/mips/mips.opt
@@ -473,6 +473,10 @@ mframe-header-opt
Target Var(flag_frame_header_optimization) Optimization
Optimize frame header.
+mgrow-frame-downwards
+Target Var(TARGET_FRAME_GROWS_DOWNWARDS) Init(1) Undocumented
+Change the behaviour to grow the frame downwards.
+
noasmopt
Driver
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index d326ca4..9796839 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -120,6 +120,51 @@ Target RejectNegative Alias(misa=,sm_89)
march-map=sm_90a
Target RejectNegative Alias(misa=,sm_89)
+march-map=sm_100
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_100f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_100a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_101a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_103a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_120a
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121f
+Target RejectNegative Alias(misa=,sm_89)
+
+march-map=sm_121a
+Target RejectNegative Alias(misa=,sm_89)
+
Enum
Name(ptx_version) Type(enum ptx_version)
Known PTX ISA versions (for use with the -mptx= option):
diff --git a/gcc/config/pru/pru.cc b/gcc/config/pru/pru.cc
index 322e319..3fdc56e 100644
--- a/gcc/config/pru/pru.cc
+++ b/gcc/config/pru/pru.cc
@@ -941,10 +941,19 @@ pru_init_libfuncs (void)
/* Long long. */
set_optab_libfunc (ashr_optab, DImode, "__pruabi_asrll");
- set_optab_libfunc (smul_optab, DImode, "__pruabi_mpyll");
set_optab_libfunc (ashl_optab, DImode, "__pruabi_lslll");
set_optab_libfunc (lshr_optab, DImode, "__pruabi_lsrll");
+ if (TARGET_OPT_MUL)
+ {
+ set_optab_libfunc (smul_optab, DImode, "__pruabi_mpyll");
+ }
+ else
+ {
+ set_optab_libfunc (smul_optab, DImode, "__pruabi_softmpyll");
+ set_optab_libfunc (smul_optab, SImode, "__pruabi_softmpyi");
+ }
+
set_optab_libfunc (sdiv_optab, SImode, "__pruabi_divi");
set_optab_libfunc (udiv_optab, SImode, "__pruabi_divu");
set_optab_libfunc (smod_optab, SImode, "__pruabi_remi");
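With this change, a DImode multiplication resolves to __pruabi_mpyll only when the multiplier unit is allowed (-mmul), and to the new soft routines otherwise. A minimal C example whose multiply takes that libcall path (the function name is illustrative):

    #include <stdint.h>

    uint64_t
    scale (uint64_t a, uint64_t b)
    {
      return a * b;   /* DImode multiply: becomes a libcall on PRU  */
    }
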
diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h
index 6c0719b..9d547ed 100644
--- a/gcc/config/pru/pru.h
+++ b/gcc/config/pru/pru.h
@@ -65,6 +65,9 @@
#undef ENDFILE_SPEC
#define ENDFILE_SPEC "%{!mabi=ti:-lgloss} "
+#undef MULTILIB_DEFAULTS
+#define MULTILIB_DEFAULTS { "mloop", "mmul", "mfillzero" }
+
/* TI ABI mandates that ELF symbols do not start with any prefix. */
#undef USER_LABEL_PREFIX
#define USER_LABEL_PREFIX ""
diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md
index 3504e42..b8ef55b 100644
--- a/gcc/config/pru/pru.md
+++ b/gcc/config/pru/pru.md
@@ -215,7 +215,7 @@
mov\\t%0, %1
ldi\\t%0, %%pmem(%1)
ldi\\t%0, %1
- fill\\t%0, 4
+ * return TARGET_OPT_FILLZERO ? \"fill\\t%0, 4\" : \"ldi32\\t%0, 0xffffffff\";
ldi32\\t%0, %1"
[(set_attr "type" "st,ld,alu,alu,alu,alu,alu")
(set_attr "length" "4,4,4,4,4,4,8")])
@@ -259,9 +259,11 @@
case 1:
return "lb%B1o\\t%b0, %1, %S1";
case 2:
- return "zero\\t%F0, 8";
+ return TARGET_OPT_FILLZERO ? "zero\\t%F0, 8"
+ : "ldi\\t%F0, 0\;ldi\\t%N0, 0";
case 3:
- return "fill\\t%F0, 8";
+ return TARGET_OPT_FILLZERO ? "fill\\t%F0, 8"
+ : "ldi32\\t%F0, 0xffffffff\;mov\\t%N0, %F0";
case 4:
/* careful with overlapping source and destination regs. */
gcc_assert (GP_REG_P (REGNO (operands[0])));
@@ -502,7 +504,7 @@
(define_insn "zero_extendqidi2"
[(set (match_operand:DI 0 "register_operand" "=r,r")
(zero_extend:DI (match_operand:QI 1 "register_operand" "0,r")))]
- ""
+ "TARGET_OPT_FILLZERO"
"@
zero\\t%F0.b1, 7
mov\\t%F0.b0, %1\;zero\\t%F0.b1, 7"
@@ -512,7 +514,7 @@
(define_insn "zero_extendhidi2"
[(set (match_operand:DI 0 "register_operand" "=r,r")
(zero_extend:DI (match_operand:HI 1 "register_operand" "0,r")))]
- ""
+ "TARGET_OPT_FILLZERO"
"@
zero\\t%F0.b2, 6
mov\\t%F0.w0, %1\;zero\\t%F0.b2, 6"
@@ -522,7 +524,7 @@
(define_insn "zero_extendsidi2"
[(set (match_operand:DI 0 "register_operand" "=r,r")
(zero_extend:DI (match_operand:SI 1 "register_operand" "0,r")))]
- ""
+ "TARGET_OPT_FILLZERO"
"@
zero\\t%N0, 4
mov\\t%F0, %1\;zero\\t%N0, 4"
@@ -535,7 +537,7 @@
(define_expand "extend<EQS0:mode><EQDHIDI:mode>2"
[(set (match_operand:EQDHIDI 0 "register_operand" "=r")
(sign_extend:EQDHIDI (match_operand:EQS0 1 "register_operand" "r")))]
- ""
+ "TARGET_OPT_FILLZERO"
{
rtx_code_label *skip_hiset_label;
@@ -744,7 +746,7 @@
(ior:HIDI
(match_operand:HIDI 1 "register_operand" "0")
(match_operand:HIDI 2 "const_fillbytes_operand" "Uf")))]
- ""
+ "TARGET_OPT_FILLZERO"
{
static char line[64];
pru_byterange r;
@@ -767,7 +769,7 @@
(and:HIDI
(match_operand:HIDI 1 "register_operand" "0")
(match_operand:HIDI 2 "const_zerobytes_operand" "Uz")))]
- ""
+ "TARGET_OPT_FILLZERO"
{
static char line[64];
pru_byterange r;
@@ -1114,7 +1116,8 @@
/* Try with the more efficient zero/fill patterns first. */
if (<LOGICAL_BITOP:CODE> == IOR
&& CONST_INT_P (operands[2])
- && const_fillbytes_operand (operands[2], DImode))
+ && const_fillbytes_operand (operands[2], DImode)
+ && TARGET_OPT_FILLZERO)
{
rtx insn = maybe_gen_pru_ior_fillbytes (DImode,
operands[0],
@@ -1130,7 +1133,8 @@
}
if (<LOGICAL_BITOP:CODE> == AND
&& CONST_INT_P (operands[2])
- && const_zerobytes_operand (operands[2], DImode))
+ && const_zerobytes_operand (operands[2], DImode)
+ && TARGET_OPT_FILLZERO)
{
rtx insn = maybe_gen_pru_and_zerobytes (DImode,
operands[0],
@@ -1212,7 +1216,7 @@
[(set (match_operand:SI 0 "pru_muldst_operand" "=Rmd0")
(mult:SI (match_operand:SI 1 "pru_mulsrc0_operand" "%Rms0")
(match_operand:SI 2 "pru_mulsrc1_operand" "Rms1")))]
- ""
+ "TARGET_OPT_MUL"
"nop\;xin\\t0, %0, 4"
[(set_attr "type" "alu")
(set_attr "length" "8")])
diff --git a/gcc/config/pru/pru.opt b/gcc/config/pru/pru.opt
index 8385beb..5206b2a 100644
--- a/gcc/config/pru/pru.opt
+++ b/gcc/config/pru/pru.opt
@@ -39,6 +39,14 @@ mloop
Target Mask(OPT_LOOP)
Allow (or do not allow) gcc to use the LOOP instruction.
+mmul
+Target Mask(OPT_MUL)
+Allow (or do not allow) gcc to use the PRU multiplier unit.
+
+mfillzero
+Target Mask(OPT_FILLZERO)
+Allow (or do not allow) gcc to use the FILL and ZERO instructions.
+
mabi=
Target RejectNegative Joined Enum(pru_abi_t) Var(pru_current_abi) Init(PRU_ABI_GNU) Save
Select target ABI variant.
diff --git a/gcc/config/pru/pru.opt.urls b/gcc/config/pru/pru.opt.urls
index c87affb..5c57892 100644
--- a/gcc/config/pru/pru.opt.urls
+++ b/gcc/config/pru/pru.opt.urls
@@ -12,6 +12,12 @@ UrlSuffix(gcc/PRU-Options.html#index-mno-relax-1)
mloop
UrlSuffix(gcc/PRU-Options.html#index-mloop)
+mmul
+UrlSuffix(gcc/PRU-Options.html#index-mmul)
+
+mfillzero
+UrlSuffix(gcc/PRU-Options.html#index-mfillzero)
+
mabi=
UrlSuffix(gcc/PRU-Options.html#index-mabi-4)
diff --git a/gcc/config/pru/t-multilib b/gcc/config/pru/t-multilib
new file mode 100644
index 0000000..1e3c2b8
--- /dev/null
+++ b/gcc/config/pru/t-multilib
@@ -0,0 +1,29 @@
+# Copyright (C) 2025 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+#
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>.
+
+MULTILIB_OPTIONS =
+MULTILIB_OPTIONS += mloop/mno-loop
+MULTILIB_OPTIONS += mmul/mno-mul
+MULTILIB_OPTIONS += mfillzero/mno-fillzero
+
+# Build two variants:
+# - Newer PRU core versions, present in AM335x and later.
+# - Older PRU core versions, present in AM18xx.
+MULTILIB_REQUIRED =
+MULTILIB_REQUIRED += mloop/mmul/mfillzero
+MULTILIB_REQUIRED += mno-loop/mno-mul/mno-fillzero
diff --git a/gcc/config/riscv/arch-canonicalize b/gcc/config/riscv/arch-canonicalize
index fd55255..15a3985 100755
--- a/gcc/config/riscv/arch-canonicalize
+++ b/gcc/config/riscv/arch-canonicalize
@@ -20,77 +20,326 @@
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.
-# TODO: Extract riscv_subset_t from riscv-common.cc and make it can be compiled
-# standalone to replace this script, that also prevents us implementing
-# that twice and keep sync again and again.
-
from __future__ import print_function
import sys
import argparse
import collections
import itertools
+import re
+import os
from functools import reduce
SUPPORTED_ISA_SPEC = ["2.2", "20190608", "20191213"]
-CANONICAL_ORDER = "imafdgqlcbkjtpvn"
+CANONICAL_ORDER = "imafdqlcbkjtpvnh"
LONG_EXT_PREFIXES = ['z', 's', 'h', 'x']
+def parse_define_riscv_ext(content):
+ """Parse DEFINE_RISCV_EXT macros using position-based parsing."""
+ extensions = []
+
+ # Find all DEFINE_RISCV_EXT blocks
+ pattern = r'DEFINE_RISCV_EXT\s*\('
+ matches = []
+
+ pos = 0
+ while True:
+ match = re.search(pattern, content[pos:])
+ if not match:
+ break
+
+ start_pos = pos + match.start()
+ paren_count = 0
+ current_pos = pos + match.end() - 1 # Start at the opening parenthesis
+
+ # Find the matching closing parenthesis
+ while current_pos < len(content):
+ if content[current_pos] == '(':
+ paren_count += 1
+ elif content[current_pos] == ')':
+ paren_count -= 1
+ if paren_count == 0:
+ break
+ current_pos += 1
+
+ if paren_count == 0:
+ # Extract the content inside parentheses
+ macro_content = content[pos + match.end():current_pos]
+ ext_data = parse_macro_arguments(macro_content)
+ if ext_data:
+ extensions.append(ext_data)
+
+ pos = current_pos + 1
+
+ return extensions
+
+def parse_macro_arguments(macro_content):
+ """Parse the arguments of a DEFINE_RISCV_EXT macro."""
+ # Remove comments /* ... */
+ cleaned_content = re.sub(r'/\*[^*]*\*/', '', macro_content)
+
+ # Split arguments by comma, but respect nested structures
+ args = []
+ current_arg = ""
+ paren_count = 0
+ brace_count = 0
+ in_string = False
+ escape_next = False
+
+ for char in cleaned_content:
+ if escape_next:
+ current_arg += char
+ escape_next = False
+ continue
+
+ if char == '\\':
+ escape_next = True
+ current_arg += char
+ continue
+
+ if char == '"' and not escape_next:
+ in_string = not in_string
+ current_arg += char
+ continue
+
+ if in_string:
+ current_arg += char
+ continue
+
+ if char == '(':
+ paren_count += 1
+ elif char == ')':
+ paren_count -= 1
+ elif char == '{':
+ brace_count += 1
+ elif char == '}':
+ brace_count -= 1
+ elif char == ',' and paren_count == 0 and brace_count == 0:
+ args.append(current_arg.strip())
+ current_arg = ""
+ continue
+
+ current_arg += char
+
+ # Add the last argument
+ if current_arg.strip():
+ args.append(current_arg.strip())
+
+ # We need at least 6 arguments to get DEP_EXTS (position 5)
+ if len(args) < 6:
+ return None
+
+ ext_name = args[0].strip()
+ dep_exts_arg = args[5].strip() # DEP_EXTS is at position 5
+
+ # Parse dependency extensions from the DEP_EXTS argument
+ deps = parse_dep_exts(dep_exts_arg)
+
+ return {
+ 'name': ext_name,
+ 'dep_exts': deps
+ }
+
+def parse_dep_exts(dep_exts_str):
+ """Parse the DEP_EXTS argument to extract dependency list with conditions."""
+ # Remove outer parentheses if present
+ dep_exts_str = dep_exts_str.strip()
+ if dep_exts_str.startswith('(') and dep_exts_str.endswith(')'):
+ dep_exts_str = dep_exts_str[1:-1].strip()
+
+ # Remove outer braces if present
+ if dep_exts_str.startswith('{') and dep_exts_str.endswith('}'):
+ dep_exts_str = dep_exts_str[1:-1].strip()
+
+ if not dep_exts_str:
+ return []
+
+ deps = []
+
+ # First, find and process conditional dependencies
+ conditional_pattern = r'\{\s*"([^"]+)"\s*,\s*(\[.*?\]\s*\([^)]*\)\s*->\s*bool.*?)\}'
+ conditional_matches = []
+
+ for match in re.finditer(conditional_pattern, dep_exts_str, re.DOTALL):
+ ext_name = match.group(1)
+ condition_code = match.group(2)
+ deps.append({'ext': ext_name, 'type': 'conditional', 'condition': condition_code})
+ # The conditional_pattern RE matches only the first code block enclosed
+ # in braces.
+ #
+ # Extend the match to the condition block's closing brace, encompassing
+ # all code blocks, by simply trying to match the numbers of opening
+ # and closing braces. While crude, this avoids writing a complicated
+ # parse here.
+ closing_braces_left = condition_code.count('{') - condition_code.count('}')
+ condition_end = match.end()
+ while closing_braces_left > 0:
+ condition_end = dep_exts_str.find('}', condition_end)
+ closing_braces_left -= 1
+ conditional_matches.append((match.start(), condition_end))
+
+ # Remove conditional dependency blocks from the string
+ remaining_str = dep_exts_str
+ for start, end in reversed(conditional_matches): # Reverse order to maintain indices
+ remaining_str = remaining_str[:start] + remaining_str[end:]
+
+ # Now handle simple quoted strings in the remaining text
+ for match in re.finditer(r'"([^"]+)"', remaining_str):
+ deps.append({'ext': match.group(1), 'type': 'simple'})
+
+ # Remove duplicates while preserving order
+ seen = set()
+ unique_deps = []
+ for dep in deps:
+ key = (dep['ext'], dep['type'])
+ if key not in seen:
+ seen.add(key)
+ unique_deps.append(dep)
+
+ return unique_deps
+
+def evaluate_conditional_dependency(ext, dep, xlen, current_exts):
+ """Evaluate whether a conditional dependency should be included."""
+ ext_name = dep['ext']
+ condition = dep['condition']
+ # Parse the condition based on known patterns
+ if ext_name == 'zcf' and ext in ['zca', 'c', 'zce']:
+ # zcf depends on RV32 and F extension
+ return xlen == 32 and 'f' in current_exts
+ elif ext_name == 'zcd' and ext in ['zca', 'c']:
+ # zcd depends on D extension
+ return 'd' in current_exts
+ elif ext_name == 'c' and ext in ['zca']:
+ # Special case for zca -> c conditional dependency
+ if xlen == 32:
+ if 'd' in current_exts:
+ return 'zcf' in current_exts and 'zcd' in current_exts
+ elif 'f' in current_exts:
+ return 'zcf' in current_exts
+ else:
+ return True
+ elif xlen == 64:
+ if 'd' in current_exts:
+ return 'zcd' in current_exts
+ else:
+ return True
+ return False
+ else:
+ # Report error for unhandled conditional dependencies
+ import sys
+ print(f"ERROR: Unhandled conditional dependency: '{ext_name}' with condition:", file=sys.stderr)
+ print(f" Condition code: {condition[:100]}...", file=sys.stderr)
+ print(f" Current context: xlen={xlen}, exts={sorted(current_exts)}", file=sys.stderr)
+ # For now, return False to be safe
+ return False
+
+def resolve_dependencies(arch_parts, xlen):
+ """Resolve all dependencies including conditional ones."""
+ current_exts = set(arch_parts)
+ implied_deps = set()
+
+ # Keep resolving until no new dependencies are found
+ changed = True
+ while changed:
+ changed = False
+ new_deps = set()
+
+ for ext in current_exts | implied_deps:
+ if ext in IMPLIED_EXT:
+ for dep in IMPLIED_EXT[ext]:
+ if dep['type'] == 'simple':
+ if dep['ext'] not in current_exts and dep['ext'] not in implied_deps:
+ new_deps.add(dep['ext'])
+ changed = True
+ elif dep['type'] == 'conditional':
+ should_include = evaluate_conditional_dependency(ext, dep, xlen, current_exts | implied_deps)
+ if should_include:
+ if dep['ext'] not in current_exts and dep['ext'] not in implied_deps:
+ new_deps.add(dep['ext'])
+ changed = True
+
+ implied_deps.update(new_deps)
+
+ return implied_deps
+
+def parse_def_file(file_path, script_dir, processed_files=None, collect_all=False):
+ """Parse a single .def file and recursively process #include directives."""
+ if processed_files is None:
+ processed_files = set()
+
+ # Avoid infinite recursion
+ if file_path in processed_files:
+ return ({}, set()) if collect_all else {}
+ processed_files.add(file_path)
+
+ implied_ext = {}
+ all_extensions = set() if collect_all else None
+
+ if not os.path.exists(file_path):
+ return (implied_ext, all_extensions) if collect_all else implied_ext
+
+ with open(file_path, 'r') as f:
+ content = f.read()
+
+ # Process #include directives first
+ include_pattern = r'#include\s+"([^"]+)"'
+ includes = re.findall(include_pattern, content)
+
+ for include_file in includes:
+ include_path = os.path.join(script_dir, include_file)
+ if collect_all:
+ included_ext, included_all = parse_def_file(include_path, script_dir, processed_files, collect_all)
+ implied_ext.update(included_ext)
+ all_extensions.update(included_all)
+ else:
+ included_ext = parse_def_file(include_path, script_dir, processed_files, collect_all)
+ implied_ext.update(included_ext)
+
+ # Parse DEFINE_RISCV_EXT blocks using position-based parsing
+ parsed_exts = parse_define_riscv_ext(content)
+
+ for ext_data in parsed_exts:
+ ext_name = ext_data['name']
+ deps = ext_data['dep_exts']
+
+ if collect_all:
+ all_extensions.add(ext_name)
+
+ if deps:
+ implied_ext[ext_name] = deps
+
+ return (implied_ext, all_extensions) if collect_all else implied_ext
+
+def parse_def_files():
+ """Parse RISC-V extension definition files starting from riscv-ext.def."""
+ # Get directory containing this script
+ try:
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ except NameError:
+ # When __file__ is not defined (e.g., interactive mode)
+ script_dir = os.getcwd()
+
+ # Start with the main definition file
+ main_def_file = os.path.join(script_dir, 'riscv-ext.def')
+ return parse_def_file(main_def_file, script_dir)
+
+def get_all_extensions():
+ """Get all supported extensions and their implied extensions."""
+ # Get directory containing this script
+ try:
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ except NameError:
+ # When __file__ is not defined (e.g., interactive mode)
+ script_dir = os.getcwd()
+
+ # Start with the main definition file
+ main_def_file = os.path.join(script_dir, 'riscv-ext.def')
+ return parse_def_file(main_def_file, script_dir, collect_all=True)
+
#
# IMPLIED_EXT(ext) -> implied extension list.
+# This is loaded dynamically from .def files
#
-IMPLIED_EXT = {
- "d" : ["f", "zicsr"],
-
- "a" : ["zaamo", "zalrsc"],
- "zabha" : ["zaamo"],
- "zacas" : ["zaamo"],
-
- "f" : ["zicsr"],
- "b" : ["zba", "zbb", "zbs"],
- "zdinx" : ["zfinx", "zicsr"],
- "zfinx" : ["zicsr"],
- "zhinx" : ["zhinxmin", "zfinx", "zicsr"],
- "zhinxmin" : ["zfinx", "zicsr"],
-
- "zk" : ["zkn", "zkr", "zkt"],
- "zkn" : ["zbkb", "zbkc", "zbkx", "zkne", "zknd", "zknh"],
- "zks" : ["zbkb", "zbkc", "zbkx", "zksed", "zksh"],
-
- "v" : ["zvl128b", "zve64d"],
- "zve32x" : ["zvl32b"],
- "zve64x" : ["zve32x", "zvl64b"],
- "zve32f" : ["f", "zve32x"],
- "zve64f" : ["f", "zve32f", "zve64x"],
- "zve64d" : ["d", "zve64f"],
-
- "zvl64b" : ["zvl32b"],
- "zvl128b" : ["zvl64b"],
- "zvl256b" : ["zvl128b"],
- "zvl512b" : ["zvl256b"],
- "zvl1024b" : ["zvl512b"],
- "zvl2048b" : ["zvl1024b"],
- "zvl4096b" : ["zvl2048b"],
- "zvl8192b" : ["zvl4096b"],
- "zvl16384b" : ["zvl8192b"],
- "zvl32768b" : ["zvl16384b"],
- "zvl65536b" : ["zvl32768b"],
-
- "zvkn" : ["zvkned", "zvknhb", "zvkb", "zvkt"],
- "zvknc" : ["zvkn", "zvbc"],
- "zvkng" : ["zvkn", "zvkg"],
- "zvks" : ["zvksed", "zvksh", "zvkb", "zvkt"],
- "zvksc" : ["zvks", "zvbc"],
- "zvksg" : ["zvks", "zvkg"],
- "zvbb" : ["zvkb"],
- "zvbc" : ["zve64x"],
- "zvkb" : ["zve32x"],
- "zvkg" : ["zve32x"],
- "zvkned" : ["zve32x"],
- "zvknha" : ["zve32x"],
- "zvknhb" : ["zve64x"],
- "zvksed" : ["zve32x"],
- "zvksh" : ["zve32x"],
-}
+IMPLIED_EXT = parse_def_files()
def arch_canonicalize(arch, isa_spec):
# TODO: Support extension version.
@@ -123,21 +372,31 @@ def arch_canonicalize(arch, isa_spec):
long_exts += extra_long_ext
#
- # Handle implied extensions.
+ # Handle implied extensions using new conditional logic.
#
- any_change = True
- while any_change:
- any_change = False
- for ext in std_exts + long_exts:
- if ext in IMPLIED_EXT:
- implied_exts = IMPLIED_EXT[ext]
- for implied_ext in implied_exts:
- if implied_ext == 'zicsr' and is_isa_spec_2p2:
- continue
+ # Extract xlen from architecture string
+ # TODO: We should support profile here.
+ if arch.startswith('rv32'):
+ xlen = 32
+ elif arch.startswith('rv64'):
+ xlen = 64
+ else:
+ raise Exception("Unsupported prefix `%s`" % arch)
- if implied_ext not in std_exts + long_exts:
- long_exts.append(implied_ext)
- any_change = True
+ # Get all current extensions
+ current_exts = std_exts + long_exts
+
+ # Resolve dependencies
+ implied_deps = resolve_dependencies(current_exts, xlen)
+
+ # Filter out zicsr for ISA spec 2.2
+ if is_isa_spec_2p2:
+ implied_deps.discard('zicsr')
+
+ # Add implied dependencies to long_exts
+ for dep in implied_deps:
+ if dep not in current_exts:
+ long_exts.append(dep)
# Single letter extension might appear in the long_exts list,
# because we just append extensions list to the arch string.
@@ -179,17 +438,177 @@ def arch_canonicalize(arch, isa_spec):
return new_arch
-if len(sys.argv) < 2:
- print ("Usage: %s <arch_str> [<arch_str>*]" % sys.argv)
- sys.exit(1)
+def dump_all_extensions():
+ """Dump all extensions and their implied extensions."""
+ implied_ext, all_extensions = get_all_extensions()
+
+ print("All supported RISC-V extensions:")
+ print("=" * 60)
+
+ if not all_extensions:
+ print("No extensions found.")
+ return
-parser = argparse.ArgumentParser()
-parser.add_argument('-misa-spec', type=str,
- default='20191213',
- choices=SUPPORTED_ISA_SPEC)
-parser.add_argument('arch_strs', nargs=argparse.REMAINDER)
+ # Sort all extensions for consistent output
+ sorted_all = sorted(all_extensions)
-args = parser.parse_args()
+ # Print all extensions with their dependencies (if any)
+ for ext_name in sorted_all:
+ if ext_name in implied_ext:
+ deps = implied_ext[ext_name]
+ dep_strs = []
+ for dep in deps:
+ if dep['type'] == 'simple':
+ dep_strs.append(dep['ext'])
+ else:
+ dep_strs.append(f"{dep['ext']}*") # Mark conditional deps with *
+ print(f"{ext_name:15} -> {', '.join(dep_strs)}")
+ else:
+ print(f"{ext_name:15} -> (no dependencies)")
+
+ print(f"\nTotal extensions: {len(all_extensions)}")
+ print(f"Extensions with dependencies: {len(implied_ext)}")
+ print(f"Extensions without dependencies: {len(all_extensions) - len(implied_ext)}")
+
+def run_unit_tests():
+ """Run unit tests using pytest dynamically imported."""
+ try:
+ import pytest
+ except ImportError:
+ print("Error: pytest is required for running unit tests.")
+ print("Please install pytest: pip install pytest")
+ return 1
+
+ # Define test functions
+ def test_basic_arch_parsing():
+ """Test basic architecture string parsing."""
+ result = arch_canonicalize("rv64i", "20191213")
+ assert result == "rv64i"
+
+ def test_simple_extensions():
+ """Test simple extension handling."""
+ result = arch_canonicalize("rv64im", "20191213")
+ assert "zmmul" in result
+
+ def test_implied_extensions():
+ """Test implied extension resolution."""
+ result = arch_canonicalize("rv64imaf", "20191213")
+ assert "zicsr" in result
+
+ def test_conditional_dependencies():
+ """Test conditional dependency evaluation."""
+ # Test RV32 with F extension should include zcf when c is present
+ result = arch_canonicalize("rv32ifc", "20191213")
+ parts = result.split("_")
+ if "c" in parts:
+ assert "zca" in parts
+ if "f" in parts:
+ assert "zcf" in parts
+
+ def test_parse_dep_exts():
+ """Test dependency parsing function."""
+ # Test simple dependency
+ deps = parse_dep_exts('{"ext1", "ext2"}')
+ assert len(deps) == 2
+ assert deps[0]['ext'] == 'ext1'
+ assert deps[0]['type'] == 'simple'
+
+ def test_evaluate_conditional_dependency():
+ """Test conditional dependency evaluation."""
+ # Test zcf condition for RV32 with F
+ dep = {'ext': 'zcf', 'type': 'conditional', 'condition': 'test'}
+ result = evaluate_conditional_dependency('zce', dep, 32, {'f'})
+ assert result == True
+
+ # Test zcf condition for RV64 with F (should be False)
+ result = evaluate_conditional_dependency('zce', dep, 64, {'f'})
+ assert result == False
+
+ def test_parse_define_riscv_ext():
+ """Test DEFINE_RISCV_EXT parsing."""
+ content = '''
+ DEFINE_RISCV_EXT(
+ /* NAME */ test,
+ /* UPPERCASE_NAME */ TEST,
+ /* FULL_NAME */ "Test extension",
+ /* DESC */ "",
+ /* URL */ ,
+ /* DEP_EXTS */ ({"dep1", "dep2"}),
+ /* SUPPORTED_VERSIONS */ ({{1, 0}}),
+ /* FLAG_GROUP */ test,
+ /* BITMASK_GROUP_ID */ 0,
+ /* BITMASK_BIT_POSITION*/ 0,
+ /* EXTRA_EXTENSION_FLAGS */ 0)
+ '''
+
+ extensions = parse_define_riscv_ext(content)
+ assert len(extensions) == 1
+ assert extensions[0]['name'] == 'test'
+ assert len(extensions[0]['dep_exts']) == 2
-for arch in args.arch_strs:
- print (arch_canonicalize(arch, args.misa_spec))
+ def test_parse_long_condition_block():
+ """Test condition block containing several code blocks."""
+ result = arch_canonicalize("rv32ec", "20191213")
+ assert "rv32ec_zca" in result
+
+ # Collect test functions
+ test_functions = [
+ test_basic_arch_parsing,
+ test_simple_extensions,
+ test_implied_extensions,
+ test_conditional_dependencies,
+ test_parse_dep_exts,
+ test_evaluate_conditional_dependency,
+ test_parse_define_riscv_ext,
+ test_parse_long_condition_block
+ ]
+
+ # Run tests manually first, then optionally with pytest
+ print("Running unit tests...")
+
+ passed = 0
+ failed = 0
+
+ for i, test_func in enumerate(test_functions):
+ try:
+ print(f" Running {test_func.__name__}...", end=" ")
+ test_func()
+ print("PASSED")
+ passed += 1
+ except Exception as e:
+ print(f"FAILED: {e}")
+ failed += 1
+
+ print(f"\nTest Summary: {passed} passed, {failed} failed")
+
+ if failed == 0:
+ print("\nAll tests passed!")
+ return 0
+ else:
+ print(f"\n{failed} test(s) failed!")
+ return 1
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-misa-spec', type=str,
+ default='20191213',
+ choices=SUPPORTED_ISA_SPEC)
+ parser.add_argument('--dump-all', action='store_true',
+ help='Dump all extensions and their implied extensions')
+ parser.add_argument('--selftest', action='store_true',
+ help='Run unit tests using pytest')
+ parser.add_argument('arch_strs', nargs='*',
+ help='Architecture strings to canonicalize')
+
+ args = parser.parse_args()
+
+ if args.dump_all:
+ dump_all_extensions()
+ elif args.selftest:
+ sys.exit(run_unit_tests())
+ elif args.arch_strs:
+ for arch in args.arch_strs:
+ print (arch_canonicalize(arch, args.misa_spec))
+ else:
+ parser.print_help()
+ sys.exit(1)
diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index 6531996..9695fdc 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1679,6 +1679,26 @@
;; Combine vec_duplicate + op.vv to op.vx
;; Include
;; - vadd.vx
+;; - vsub.vx
+;; - vrsub.vx
+;; - vand.vx
+;; - vor.vx
+;; - vmul.vx
+;; - vdiv.vx
+;; - vdivu.vx
+;; - vrem.vx
+;; - vremu.vx
+;; - vmax.vx
+;; - vmaxu.vx
+;; - vmin.vx
+;; - vminu.vx
+;; - vsadd.vx
+;; - vsaddu.vx
+;; - vssub.vx
+;; - vssubu.vx
+;; - vaadd.vx
+;; - vaaddu.vx
+;; - vmerge.vxm
;; =============================================================================
(define_insn_and_split "*<optab>_vx_<mode>"
[(set (match_operand:V_VLSI 0 "register_operand")
@@ -1694,6 +1714,8 @@
riscv_vector::expand_vx_binary_vec_dup_vec (operands[0], operands[2],
operands[1], <CODE>,
<MODE>mode);
+
+ DONE;
}
[(set_attr "type" "vialu")])
@@ -1711,6 +1733,8 @@
riscv_vector::expand_vx_binary_vec_vec_dup (operands[0], operands[1],
operands[2], <CODE>,
<MODE>mode);
+
+ DONE;
}
[(set_attr "type" "vialu")])
@@ -1782,6 +1806,69 @@
}
[(set_attr "type" "vaalu")])
+(define_insn_and_split "*merge_vx_<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand")
+ (if_then_else:V_VLSI
+ (match_operand:<VM> 3 "vector_mask_operand")
+ (vec_duplicate:V_VLSI
+ (match_operand:<VEL> 2 "reg_or_int_operand"))
+ (match_operand:V_VLSI 1 "register_operand")))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ insn_code icode = code_for_pred_merge_scalar (<MODE>mode);
+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::MERGE_OP, operands);
+ DONE;
+ }
+ [(set_attr "type" "vimerge")])
+
+(define_insn_and_split "*vmacc_vx_<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand")
+ (plus:V_VLSI
+ (mult:V_VLSI
+ (vec_duplicate:V_VLSI
+ (match_operand:<VEL> 1 "register_operand"))
+ (match_operand:V_VLSI 2 "register_operand"))
+ (match_operand:V_VLSI 3 "register_operand")))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ insn_code icode = code_for_pred_mul_plus_vx (<MODE>mode);
+ rtx ops[] = {operands[0], operands[1], operands[2], operands[3],
+ RVV_VUNDEF(<MODE>mode)};
+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::TERNARY_OP, ops);
+
+ DONE;
+ }
+ [(set_attr "type" "vimuladd")])
+
+(define_insn_and_split "*vnmsac_vx_<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand")
+ (minus:V_VLSI
+ (match_operand:V_VLSI 3 "register_operand")
+ (mult:V_VLSI
+ (vec_duplicate:V_VLSI
+ (match_operand:<VEL> 1 "register_operand"))
+ (match_operand:V_VLSI 2 "register_operand"))))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ insn_code icode = code_for_pred_vnmsac_vx (<MODE>mode);
+ rtx ops[] = {operands[0], operands[1], operands[2], operands[3],
+ RVV_VUNDEF(<MODE>mode)};
+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::TERNARY_OP, ops);
+
+ DONE;
+ }
+ [(set_attr "type" "vimuladd")])
+
+
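A C-level sketch of the loops whose vectorized form the new *vmacc_vx_<mode> and *vnmsac_vx_<mode> combine patterns target (a scalar broadcast multiplied into a vector and accumulated); whether the .vx variant is actually formed depends on the usual costing:

    #include <stdint.h>

    void
    macc (int32_t *acc, const int32_t *b, int32_t x, int n)
    {
      for (int i = 0; i < n; i++)
        acc[i] += x * b[i];     /* candidate for vmacc.vx  */
    }

    void
    nmsac (int32_t *acc, const int32_t *b, int32_t x, int n)
    {
      for (int i = 0; i < n; i++)
        acc[i] -= x * b[i];     /* candidate for vnmsac.vx  */
    }
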
;; =============================================================================
;; Combine vec_duplicate + op.vv to op.vf
;; Include
@@ -1962,3 +2049,98 @@
}
[(set_attr "type" "vfwmuladd")]
)
+
+;; vfmul.vf
+(define_insn_and_split "*vfmul_vf_<mode>"
+ [(set (match_operand:V_VLSF 0 "register_operand")
+ (mult:V_VLSF
+ (vec_duplicate:V_VLSF
+ (match_operand:<VEL> 2 "register_operand"))
+ (match_operand:V_VLSF 1 "register_operand")))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ riscv_vector::emit_vlmax_insn (code_for_pred_scalar (MULT, <MODE>mode),
+ riscv_vector::BINARY_OP_FRM_DYN, operands);
+ DONE;
+ }
+ [(set_attr "type" "vfmul")]
+)
+
+;; vfrdiv.vf
+(define_insn_and_split "*vfrdiv_vf_<mode>"
+ [(set (match_operand:V_VLSF 0 "register_operand")
+ (div:V_VLSF
+ (vec_duplicate:V_VLSF
+ (match_operand:<VEL> 2 "register_operand"))
+ (match_operand:V_VLSF 1 "register_operand")))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ riscv_vector::emit_vlmax_insn (code_for_pred_reverse_scalar (DIV, <MODE>mode),
+ riscv_vector::BINARY_OP_FRM_DYN, operands);
+ DONE;
+ }
+ [(set_attr "type" "vfdiv")]
+)
+
+;; vfmin.vf
+(define_insn_and_split "*vfmin_vf_<mode>"
+ [(set (match_operand:V_VLSF 0 "register_operand")
+ (smin:V_VLSF
+ (vec_duplicate:V_VLSF
+ (match_operand:<VEL> 2 "register_operand"))
+ (match_operand:V_VLSF 1 "register_operand")))]
+ "TARGET_VECTOR && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ riscv_vector::emit_vlmax_insn (code_for_pred_scalar (SMIN, <MODE>mode),
+ riscv_vector::BINARY_OP, operands);
+ DONE;
+ }
+ [(set_attr "type" "vfminmax")]
+)
+
+(define_insn_and_split "*vfmin_vf_ieee_<mode>"
+ [(set (match_operand:V_VLSF 0 "register_operand")
+ (unspec:V_VLSF [
+ (vec_duplicate:V_VLSF
+ (match_operand:<VEL> 2 "register_operand"))
+ (match_operand:V_VLSF 1 "register_operand")
+ ] UNSPEC_VFMIN))]
+ "TARGET_VECTOR && !HONOR_SNANS (<MODE>mode) && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ riscv_vector::emit_vlmax_insn (code_for_pred_scalar (UNSPEC_VFMIN, <MODE>mode),
+ riscv_vector::BINARY_OP, operands);
+ DONE;
+ }
+ [(set_attr "type" "vfminmax")]
+)
+
+(define_insn_and_split "*vfmin_vf_ieee_<mode>"
+ [(set (match_operand:V_VLSF 0 "register_operand")
+ (unspec:V_VLSF [
+ (match_operand:V_VLSF 1 "register_operand")
+ (vec_duplicate:V_VLSF
+ (match_operand:<VEL> 2 "register_operand"))
+ ] UNSPEC_VFMIN))]
+ "TARGET_VECTOR && !HONOR_SNANS (<MODE>mode) && can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+ {
+ riscv_vector::emit_vlmax_insn (code_for_pred_scalar (UNSPEC_VFMIN, <MODE>mode),
+ riscv_vector::BINARY_OP, operands);
+ DONE;
+ }
+ [(set_attr "type" "vfminmax")]
+)
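
Similarly, the new vf patterns recognize a broadcast scalar on one side of a floating-point multiply, reversed divide, or minimum. A sketch of loops of that shape, again only candidates for the .vf forms:

    void
    fp_mul_vf (float *out, const float *a, float x, int n)
    {
      for (int i = 0; i < n; i++)
        out[i] = x * a[i];      /* vfmul.vf candidate  */
    }

    void
    fp_rdiv_vf (float *out, const float *a, float x, int n)
    {
      for (int i = 0; i < n; i++)
        out[i] = x / a[i];      /* vfrdiv.vf candidate  */
    }
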
diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index 5ecaa19..979e0df 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -330,3 +330,7 @@
(define_constraint "Q"
"An address operand that is valid for a prefetch instruction"
(match_operand 0 "prefetch_operand"))
+
+(define_address_constraint "ZD"
+ "An address operand that is valid for a mips prefetch instruction"
+ (match_test "riscv_prefetch_offset_address_p (op, mode)"))
diff --git a/gcc/config/riscv/gen-riscv-ext-opt.cc b/gcc/config/riscv/gen-riscv-ext-opt.cc
index 17b8f5b..1ca339c 100644
--- a/gcc/config/riscv/gen-riscv-ext-opt.cc
+++ b/gcc/config/riscv/gen-riscv-ext-opt.cc
@@ -4,50 +4,6 @@
#include <stdio.h>
#include "riscv-opts.h"
-struct version_t
-{
- int major;
- int minor;
- version_t (int major, int minor,
- enum riscv_isa_spec_class spec = ISA_SPEC_CLASS_NONE)
- : major (major), minor (minor)
- {}
- bool operator<(const version_t &other) const
- {
- if (major != other.major)
- return major < other.major;
- return minor < other.minor;
- }
-
- bool operator== (const version_t &other) const
- {
- return major == other.major && minor == other.minor;
- }
-};
-
-static void
-print_ext_doc_entry (const std::string &ext_name, const std::string &full_name,
- const std::string &desc,
- const std::vector<version_t> &supported_versions)
-{
- // Implementation of the function to print the documentation entry
- // for the extension.
- std::set<version_t> unique_versions;
- for (const auto &version : supported_versions)
- unique_versions.insert (version);
- printf ("@item %s\n", ext_name.c_str ());
- printf ("@tab");
- for (const auto &version : unique_versions)
- {
- printf (" %d.%d", version.major, version.minor);
- }
- printf ("\n");
- printf ("@tab %s", full_name.c_str ());
- if (desc.size ())
- printf (", %s", desc.c_str ());
- printf ("\n\n");
-}
-
int
main ()
{
diff --git a/gcc/config/riscv/gen-riscv-mcpu-texi.cc b/gcc/config/riscv/gen-riscv-mcpu-texi.cc
new file mode 100644
index 0000000..9681438
--- /dev/null
+++ b/gcc/config/riscv/gen-riscv-mcpu-texi.cc
@@ -0,0 +1,43 @@
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+int
+main ()
+{
+ puts ("@c Copyright (C) 2025 Free Software Foundation, Inc.");
+ puts ("@c This is part of the GCC manual.");
+ puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi.");
+ puts ("");
+ puts ("@c This file is generated automatically using");
+ puts ("@c gcc/config/riscv/gen-riscv-mcpu-texi.cc from:");
+ puts ("@c gcc/config/riscv/riscv-cores.def");
+ puts ("");
+ puts ("@c Please *DO NOT* edit manually.");
+ puts ("");
+ puts ("@samp{Core Name}");
+ puts ("");
+ puts ("@opindex mcpu");
+ puts ("@item -mcpu=@var{processor-string}");
+ puts ("Use architecture of and optimize the output for the given processor, specified");
+ puts ("by particular CPU name. Permissible values for this option are:");
+ puts ("");
+ puts ("");
+
+ std::vector<std::string> coreNames;
+
+#define RISCV_CORE(CORE_NAME, ARCH, MICRO_ARCH) \
+ coreNames.push_back (CORE_NAME);
+#include "riscv-cores.def"
+#undef RISCV_CORE
+
+ for (size_t i = 0; i < coreNames.size(); ++i) {
+ if (i == coreNames.size() - 1) {
+ printf("@samp{%s}.\n", coreNames[i].c_str());
+ } else {
+ printf("@samp{%s},\n\n", coreNames[i].c_str());
+ }
+ }
+
+ return 0;
+}
diff --git a/gcc/config/riscv/gen-riscv-mtune-texi.cc b/gcc/config/riscv/gen-riscv-mtune-texi.cc
new file mode 100644
index 0000000..1bdfe2a
--- /dev/null
+++ b/gcc/config/riscv/gen-riscv-mtune-texi.cc
@@ -0,0 +1,41 @@
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+int
+main ()
+{
+ puts ("@c Copyright (C) 2025 Free Software Foundation, Inc.");
+ puts ("@c This is part of the GCC manual.");
+ puts ("@c For copying conditions, see the file gcc/doc/include/fdl.texi.");
+ puts ("");
+ puts ("@c This file is generated automatically using");
+ puts ("@c gcc/config/riscv/gen-riscv-mtune-texi.cc from:");
+ puts ("@c gcc/config/riscv/riscv-cores.def");
+ puts ("");
+ puts ("@c Please *DO NOT* edit manually.");
+ puts ("");
+ puts ("@samp{Tune Name}");
+ puts ("");
+ puts ("@opindex mtune");
+ puts ("@item -mtune=@var{processor-string}");
+ puts ("Optimize the output for the given processor, specified by microarchitecture or");
+ puts ("particular CPU name. Permissible values for this option are:");
+ puts ("");
+ puts ("");
+
+ std::vector<std::string> tuneNames;
+
+#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \
+ tuneNames.push_back (TUNE_NAME);
+#include "riscv-cores.def"
+#undef RISCV_TUNE
+
+ for (size_t i = 0; i < tuneNames.size(); ++i) {
+ printf("@samp{%s},\n\n", tuneNames[i].c_str());
+ }
+
+ puts ("and all valid options for @option{-mcpu=}.");
+
+ return 0;
+}
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 381f96c..bdb3d22 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -27,10 +27,14 @@
(ior (match_operand 0 "const_arith_operand")
(match_operand 0 "register_operand")))
+(define_predicate "prefetch_const_operand"
+ (and (match_code "const_int")
+ (match_test "(IN_RANGE (INTVAL (op), 0, 511))")))
+
;; REG or REG+D where D fits in a simm12 and has the low 5 bits
;; off. The REG+D form can be reloaded into a temporary if needed
;; after FP elimination if that exposes an invalid offset.
-(define_predicate "prefetch_operand"
+(define_predicate "zicbop_prefetch_operand"
(ior (match_operand 0 "register_operand")
(and (match_test "const_arith_operand (op, VOIDmode)")
(match_test "(INTVAL (op) & 0x1f) == 0"))
@@ -39,6 +43,20 @@
(match_test "const_arith_operand (XEXP (op, 1), VOIDmode)")
(match_test "(INTVAL (XEXP (op, 1)) & 0x1f) == 0"))))
+;; REG or REG+D where D fits in a uimm9.
+(define_predicate "mips_prefetch_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_test "prefetch_const_operand (op, VOIDmode)")
+ (and (match_code "plus")
+ (match_test "register_operand (XEXP (op, 0), word_mode)")
+ (match_test "prefetch_const_operand (XEXP (op, 1), VOIDmode)"))))
+
+;; MIPS-specific or standard RISC-V (Zicbop) prefetch operand.
+(define_predicate "prefetch_operand"
+ (if_then_else (match_test "TARGET_XMIPSCBOP")
+ (match_operand 0 "mips_prefetch_operand")
+ (match_operand 0 "zicbop_prefetch_operand")))
+
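With the new predicates, a prefetch address of the form reg + D is accepted directly when D fits in a uimm9 (0..511) and the XMIPSCBOP extension is enabled; otherwise the Zicbop rules (simm12 with the low 5 bits clear) still apply. A minimal C example that produces such an address, as a sketch only:

    void
    prefetch_ahead (const char *p)
    {
      /* p + 256 keeps the offset within the uimm9 range.  */
      __builtin_prefetch (p + 256, 0 /* read */, 3 /* high locality */);
    }
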
(define_predicate "lui_operand"
(and (match_code "const_int")
(match_test "LUI_OPERAND (INTVAL (op))")))
diff --git a/gcc/config/riscv/riscv-avlprop.cc b/gcc/config/riscv/riscv-avlprop.cc
index 3031c29..b8547a7 100644
--- a/gcc/config/riscv/riscv-avlprop.cc
+++ b/gcc/config/riscv/riscv-avlprop.cc
@@ -156,6 +156,7 @@ get_insn_vtype_mode (rtx_insn *rinsn)
extract_insn_cached (rinsn);
int mode_idx = get_attr_mode_idx (rinsn);
gcc_assert (mode_idx != INVALID_ATTRIBUTE);
+ gcc_assert (mode_idx < recog_data.n_operands);
return GET_MODE (recog_data.operand[mode_idx]);
}
@@ -205,6 +206,7 @@ simplify_replace_vlmax_avl (rtx_insn *rinsn, rtx new_avl)
{
int index = get_attr_avl_type_idx (rinsn);
gcc_assert (index != INVALID_ATTRIBUTE);
+ gcc_assert (index < recog_data.n_operands);
validate_change_or_fail (rinsn, recog_data.operand_loc[index],
get_avl_type_rtx (avl_type::NONVLMAX), false);
}
@@ -361,6 +363,8 @@ pass_avlprop::get_vlmax_ta_preferred_avl (insn_info *insn) const
is not depend on. */
extract_insn_cached (use_insn->rtl ());
int merge_op_idx = get_attr_merge_op_idx (use_insn->rtl ());
+ gcc_assert (merge_op_idx == INVALID_ATTRIBUTE
+ || merge_op_idx < recog_data.n_operands);
if (merge_op_idx != INVALID_ATTRIBUTE
&& !satisfies_constraint_vu (recog_data.operand[merge_op_idx])
&& refers_to_regno_p (set->regno (),
@@ -531,7 +535,14 @@ pass_avlprop::execute (function *fn)
&& !m_avl_propagations->get (candidate.second)
&& imm_avl_p (vtype_mode))
{
- rtx new_avl = gen_int_mode (GET_MODE_NUNITS (vtype_mode), Pmode);
+ /* For segmented operations AVL refers to a single register and
+ not all NF registers. Therefore divide the mode size by NF
+ to obtain the proper AVL. */
+ int nf = 1;
+ if (riscv_v_ext_tuple_mode_p (vtype_mode))
+ nf = get_nf (vtype_mode);
+ rtx new_avl = gen_int_mode
+ (GET_MODE_NUNITS (vtype_mode).to_constant () / nf, Pmode);
simplify_replace_vlmax_avl (rinsn, new_avl);
}
}
diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def
index 98f3470..8f0f630 100644
--- a/gcc/config/riscv/riscv-cores.def
+++ b/gcc/config/riscv/riscv-cores.def
@@ -113,7 +113,7 @@ RISCV_CORE("xt-c908v", "rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicsr_"
"zvfh_sstc_svinval_svnapot_svpbmt__xtheadba_"
"xtheadbb_xtheadbs_xtheadcmo_xtheadcondmov_"
"xtheadfmemidx_xtheadmac_xtheadmemidx_"
- "xtheadmempair_xtheadsync_xtheadvdot",
+ "xtheadmempair_xtheadsync",
"xt-c908")
RISCV_CORE("xt-c910", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_"
"xtheadba_xtheadbb_xtheadbs_xtheadcmo_"
@@ -121,7 +121,7 @@ RISCV_CORE("xt-c910", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_"
"xtheadmemidx_xtheadmempair_xtheadsync",
"xt-c910")
RISCV_CORE("xt-c910v2", "rv64imafdc_zicbom_zicbop_zicboz_zicntr_zicond_"
- "zicsr_zifencei _zihintntl_zihintpause_zihpm_"
+ "zicsr_zifencei_zihintntl_zihintpause_zihpm_"
"zawrs_zfa_zfbfmin_zfh_zca_zcb_zcd_zba_zbb_zbc_"
"zbs_sscofpmf_sstc_svinval_svnapot_svpbmt_"
"xtheadba_xtheadbb_xtheadbs_xtheadcmo_"
@@ -135,13 +135,13 @@ RISCV_CORE("xt-c920", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_"
"xtheadvector",
"xt-c910")
RISCV_CORE("xt-c920v2", "rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_"
- "zicsr_zifencei _zihintntl_zihintpause_zihpm_"
+ "zicsr_zifencei_zihintntl_zihintpause_zihpm_"
"zawrs_zfa_zfbfmin_zfh_zca_zcb_zcd_zba_zbb_zbc_"
"zbs_zvfbfmin_zvfbfwma_zvfh_sscofpmf_sstc_"
"svinval_svnapot_svpbmt_xtheadba_xtheadbb_"
"xtheadbs_xtheadcmo_xtheadcondmov_xtheadfmemidx_"
"xtheadmac_xtheadmemidx_xtheadmempair_"
- "xtheadsync_xtheadvdot",
+ "xtheadsync",
"xt-c920v2")
RISCV_CORE("tt-ascalon-d8", "rv64imafdcv_zic64b_zicbom_zicbop_zicboz_"
diff --git a/gcc/config/riscv/riscv-ext-mips.def b/gcc/config/riscv/riscv-ext-mips.def
index 5d7836d..132f6c1 100644
--- a/gcc/config/riscv/riscv-ext-mips.def
+++ b/gcc/config/riscv/riscv-ext-mips.def
@@ -33,3 +33,16 @@ DEFINE_RISCV_EXT (
/* BITMASK_GROUP_ID. */ BITMASK_NOT_YET_ALLOCATED,
/* BITMASK_BIT_POSITION. */ BITMASK_NOT_YET_ALLOCATED,
/* EXTRA_EXTENSION_FLAGS. */ 0)
+
+DEFINE_RISCV_EXT (
+ /* NAME. */ xmipscbop,
+ /* UPPERCASE_NAME. */ XMIPSCBOP,
+ /* FULL_NAME. */ "Mips Prefetch extension",
+ /* DESC. */ "",
+ /* URL. */ ,
+ /* DEP_EXTS. */ ({}),
+ /* SUPPORTED_VERSIONS. */ ({{1, 0}}),
+ /* FLAG_GROUP. */ xmips,
+ /* BITMASK_GROUP_ID. */ BITMASK_NOT_YET_ALLOCATED,
+ /* BITMASK_BIT_POSITION. */ BITMASK_NOT_YET_ALLOCATED,
+ /* EXTRA_EXTENSION_FLAGS. */ 0)
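Once this entry and the riscv-ext.opt mask below are in place, the extension is enabled through the architecture string in the usual way; an illustrative invocation (not taken from the patch) would be

    riscv64-unknown-elf-gcc -march=rv64gc_xmipscbop -O2 foo.c

which sets MASK (XMIPSCBOP) in riscv_xmips_subext and hence TARGET_XMIPSCBOP, the condition tested by the prefetch changes later in this patch.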
diff --git a/gcc/config/riscv/riscv-ext.opt b/gcc/config/riscv/riscv-ext.opt
index 26d6e68..ced05d2 100644
--- a/gcc/config/riscv/riscv-ext.opt
+++ b/gcc/config/riscv/riscv-ext.opt
@@ -449,3 +449,5 @@ Mask(XTHEADVECTOR) Var(riscv_xthead_subext)
Mask(XVENTANACONDOPS) Var(riscv_xventana_subext)
Mask(XMIPSCMOV) Var(riscv_xmips_subext)
+
+Mask(XMIPSCBOP) Var(riscv_xmips_subext)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 539321f..46b256d 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -143,6 +143,8 @@ extern void riscv_expand_sstrunc (rtx, rtx);
extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t);
extern bool synthesize_ior_xor (rtx_code, rtx [3]);
extern bool synthesize_and (rtx [3]);
+extern bool synthesize_add (rtx [3]);
+extern bool synthesize_add_extended (rtx [3]);
#ifdef RTX_CODE
extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0);
@@ -830,16 +832,18 @@ extern bool th_print_operand_address (FILE *, machine_mode, rtx);
extern bool strided_load_broadcast_p (void);
extern bool riscv_use_divmod_expander (void);
-void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
+void riscv_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, tree, int);
extern bool
riscv_option_valid_attribute_p (tree, tree, tree, int);
extern bool
riscv_option_valid_version_attribute_p (tree, tree, tree, int);
extern bool
-riscv_process_target_version_attr (tree, location_t);
+riscv_process_target_version_attr (tree, location_t *);
extern void
riscv_override_options_internal (struct gcc_options *);
extern void riscv_option_override (void);
+extern rtx riscv_prefetch_cookie (rtx, rtx);
+extern bool riscv_prefetch_offset_address_p (rtx, machine_mode);
struct riscv_tune_param;
/* Information about one micro-arch we know about. */
diff --git a/gcc/config/riscv/riscv-subset.h b/gcc/config/riscv/riscv-subset.h
index a35537d..4cd860f 100644
--- a/gcc/config/riscv/riscv-subset.h
+++ b/gcc/config/riscv/riscv-subset.h
@@ -52,8 +52,9 @@ private:
/* Original arch string. */
const char *m_arch;
- /* Location of arch string, used for report error. */
- location_t m_loc;
+ /* A pointer to the location that should be used for diagnostics,
+ or null if diagnostics should be suppressed. */
+ location_t *m_loc;
/* Head of subset info list. */
riscv_subset_t *m_head;
@@ -70,7 +71,7 @@ private:
/* Allow adding the same extension more than once. */
bool m_allow_adding_dup;
- riscv_subset_list (const char *, location_t);
+ riscv_subset_list (const char *, location_t *);
const char *parsing_subset_version (const char *, const char *, unsigned *,
unsigned *, bool, bool *);
@@ -106,12 +107,12 @@ public:
riscv_subset_list *clone () const;
- static riscv_subset_list *parse (const char *, location_t);
+ static riscv_subset_list *parse (const char *, location_t *);
const char *parse_single_ext (const char *, bool exact_single_p = true);
int match_score (riscv_subset_list *) const;
- void set_loc (location_t);
+ void set_loc (location_t *);
void set_allow_adding_dup (bool v) { m_allow_adding_dup = v; }
@@ -182,7 +183,7 @@ extern void
riscv_set_arch_by_subset_list (riscv_subset_list *, struct gcc_options *);
extern bool riscv_minimal_hwprobe_feature_bits (const char *,
struct riscv_feature_bits *,
- location_t);
+ location_t *);
extern bool
riscv_ext_is_subset (struct cl_target_option *, struct cl_target_option *);
diff --git a/gcc/config/riscv/riscv-target-attr.cc b/gcc/config/riscv/riscv-target-attr.cc
index 8ad3025..5e01c92 100644
--- a/gcc/config/riscv/riscv-target-attr.cc
+++ b/gcc/config/riscv/riscv-target-attr.cc
@@ -34,7 +34,7 @@ namespace {
class riscv_target_attr_parser
{
public:
- riscv_target_attr_parser (location_t loc)
+ riscv_target_attr_parser (location_t *loc)
: m_found_arch_p (false)
, m_found_tune_p (false)
, m_found_cpu_p (false)
@@ -62,7 +62,7 @@ private:
bool m_found_cpu_p;
bool m_found_priority_p;
riscv_subset_list *m_subset_list;
- location_t m_loc;
+ location_t *m_loc;
const riscv_cpu_info *m_cpu_info;
const char *m_tune;
int m_priority;
@@ -102,15 +102,17 @@ riscv_target_attr_parser::parse_arch (const char *str)
{
if (TARGET_64BIT && strncmp ("32", str + 2, strlen ("32")) == 0)
{
- error_at (m_loc, "unexpected arch for %<target()%> attribute: "
- "must start with rv64 but found %qs", str);
+ if (m_loc)
+ error_at (*m_loc, "unexpected arch for %<target()%> attribute: "
+ "must start with rv64 but found %qs", str);
goto fail;
}
if (!TARGET_64BIT && strncmp ("64", str + 2, strlen ("64")) == 0)
{
- error_at (m_loc, "unexpected arch for %<target()%> attribute: "
- "must start with rv32 but found %qs", str);
+ if (m_loc)
+ error_at (*m_loc, "unexpected arch for %<target()%> attribute: "
+ "must start with rv32 but found %qs", str);
goto fail;
}
@@ -140,10 +142,9 @@ riscv_target_attr_parser::parse_arch (const char *str)
{
if (token[0] != '+')
{
- error_at (
- m_loc,
- "unexpected arch for %<target()%> attribute: must start "
- "with + or rv");
+	  if (m_loc)
+ error_at (*m_loc, "unexpected arch for %<target()%> "
+ "attribute: must start with + or rv");
goto fail;
}
@@ -151,10 +152,9 @@ riscv_target_attr_parser::parse_arch (const char *str)
/* Check parse_single_ext has consume all string. */
if (*result != '\0')
{
- error_at (
- m_loc,
- "unexpected arch for %<target()%> attribute: bad "
- "string found %qs", token);
+ if (m_loc)
+ error_at (*m_loc, "unexpected arch for %<target()%> "
+ "attribute: bad string found %qs", token);
goto fail;
}
@@ -179,8 +179,8 @@ fail:
bool
riscv_target_attr_parser::handle_arch (const char *str)
{
- if (m_found_arch_p)
- error_at (m_loc, "%<target()%> attribute: arch appears more than once");
+ if (m_found_arch_p && m_loc)
+ error_at (*m_loc, "%<target()%> attribute: arch appears more than once");
m_found_arch_p = true;
return parse_arch (str);
}
@@ -190,15 +190,16 @@ riscv_target_attr_parser::handle_arch (const char *str)
bool
riscv_target_attr_parser::handle_cpu (const char *str)
{
- if (m_found_cpu_p)
- error_at (m_loc, "%<target()%> attribute: cpu appears more than once");
+ if (m_found_cpu_p && m_loc)
+ error_at (*m_loc, "%<target()%> attribute: cpu appears more than once");
m_found_cpu_p = true;
const riscv_cpu_info *cpu_info = riscv_find_cpu (str);
if (!cpu_info)
{
- error_at (m_loc, "%<target()%> attribute: unknown CPU %qs", str);
+ if (m_loc)
+ error_at (*m_loc, "%<target()%> attribute: unknown CPU %qs", str);
return false;
}
@@ -218,14 +219,15 @@ riscv_target_attr_parser::handle_cpu (const char *str)
bool
riscv_target_attr_parser::handle_tune (const char *str)
{
- if (m_found_tune_p)
- error_at (m_loc, "%<target()%> attribute: tune appears more than once");
+ if (m_found_tune_p && m_loc)
+ error_at (*m_loc, "%<target()%> attribute: tune appears more than once");
m_found_tune_p = true;
const struct riscv_tune_info *tune = riscv_parse_tune (str, true);
if (tune == nullptr)
{
- error_at (m_loc, "%<target()%> attribute: unknown TUNE %qs", str);
+ if (m_loc)
+ error_at (*m_loc, "%<target()%> attribute: unknown TUNE %qs", str);
return false;
}
@@ -237,13 +239,15 @@ riscv_target_attr_parser::handle_tune (const char *str)
bool
riscv_target_attr_parser::handle_priority (const char *str)
{
- if (m_found_priority_p)
- error_at (m_loc, "%<target()%> attribute: priority appears more than once");
+ if (m_found_priority_p && m_loc)
+ error_at (*m_loc, "%<target()%> attribute: priority appears "
+ "more than once");
m_found_priority_p = true;
if (sscanf (str, "%d", &m_priority) != 1)
{
- error_at (m_loc, "%<target()%> attribute: invalid priority %qs", str);
+ if (m_loc)
+ error_at (*m_loc, "%<target()%> attribute: invalid priority %qs", str);
return false;
}
@@ -282,7 +286,7 @@ riscv_target_attr_parser::update_settings (struct gcc_options *opts) const
static bool
riscv_process_one_target_attr (char *arg_str,
- location_t loc,
+ location_t *loc,
riscv_target_attr_parser &attr_parser,
const struct riscv_attribute_info *attrs)
{
@@ -290,7 +294,8 @@ riscv_process_one_target_attr (char *arg_str,
if (len == 0)
{
- error_at (loc, "malformed %<target()%> attribute");
+ if (loc)
+ error_at (*loc, "malformed %<target()%> attribute");
return false;
}
@@ -302,10 +307,9 @@ riscv_process_one_target_attr (char *arg_str,
if (!arg)
{
- error_at (
- loc,
- "attribute %<target(\"%s\")%> does not accept an argument",
- str_to_check);
+ if (loc)
+ error_at (*loc, "attribute %<target(\"%s\")%> does not "
+ "accept an argument", str_to_check);
return false;
}
@@ -324,7 +328,8 @@ riscv_process_one_target_attr (char *arg_str,
return (&attr_parser->*attr->handler) (arg);
}
- error_at (loc, "Got unknown attribute %<target(\"%s\")%>", str_to_check);
+ if (loc)
+ error_at (*loc, "Got unknown attribute %<target(\"%s\")%>", str_to_check);
return false;
}
@@ -347,11 +352,12 @@ num_occurrences_in_str (char c, char *str)
}
/* Parse the string in ARGS that contains the target attribute information
- and update the global target options space. */
+ and update the global target options space. If LOC is nonnull, report
+ diagnostics against location *LOC, otherwise remain silent. */
bool
riscv_process_target_attr (const char *args,
- location_t loc,
+ location_t *loc,
const struct riscv_attribute_info *attrs)
{
size_t len = strlen (args);
@@ -387,8 +393,8 @@ riscv_process_target_attr (const char *args,
if (num_attrs != num_semicolons + 1)
{
- error_at (loc, "malformed %<target(\"%s\")%> attribute",
- args);
+ if (loc)
+ error_at (*loc, "malformed %<target(\"%s\")%> attribute", args);
return false;
}
@@ -399,11 +405,12 @@ riscv_process_target_attr (const char *args,
}
/* Parse the tree in ARGS that contains the target attribute information
- and update the global target options space. */
+ and update the global target options space. If LOC is nonnull, report
+ diagnostics against *LOC, otherwise remain silent. */
static bool
riscv_process_target_attr (tree args,
- location_t loc,
+ location_t *loc,
const struct riscv_attribute_info *attrs)
{
if (TREE_CODE (args) == TREE_LIST)
@@ -424,7 +431,8 @@ riscv_process_target_attr (tree args,
if (TREE_CODE (args) != STRING_CST)
{
- error_at (loc, "attribute %<target%> argument not a string");
+ if (loc)
+ error_at (*loc, "attribute %<target%> argument not a string");
return false;
}
@@ -466,7 +474,7 @@ riscv_option_valid_attribute_p (tree fndecl, tree, tree args, int)
TREE_TARGET_OPTION (target_option_default_node));
/* Now we can parse the attributes and set &global_options accordingly. */
- ret = riscv_process_target_attr (args, loc, riscv_target_attrs);
+ ret = riscv_process_target_attr (args, &loc, riscv_target_attrs);
if (ret)
{
riscv_override_options_internal (&global_options);
@@ -481,16 +489,19 @@ riscv_option_valid_attribute_p (tree fndecl, tree, tree args, int)
}
/* Parse the tree in ARGS that contains the target_version attribute
- information and update the global target options space. */
+ information and update the global target options space. If LOC is nonnull,
+ report diagnostics against *LOC, otherwise remain silent. */
bool
-riscv_process_target_version_attr (tree args, location_t loc)
+riscv_process_target_version_attr (tree args, location_t *loc)
{
if (TREE_CODE (args) == TREE_LIST)
{
if (TREE_CHAIN (args))
{
- error ("attribute %<target_version%> has multiple values");
+ if (loc)
+ error_at (*loc, "attribute %<target_version%> "
+ "has multiple values");
return false;
}
args = TREE_VALUE (args);
@@ -498,7 +509,8 @@ riscv_process_target_version_attr (tree args, location_t loc)
if (!args || TREE_CODE (args) != STRING_CST)
{
- error ("attribute %<target_version%> argument not a string");
+ if (loc)
+ error_at (*loc, "attribute %<target_version%> argument not a string");
return false;
}
@@ -541,7 +553,7 @@ riscv_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
cl_target_option_restore (&global_options, &global_options_set,
TREE_TARGET_OPTION (target_option_current_node));
- ret = riscv_process_target_version_attr (args, loc);
+ ret = riscv_process_target_version_attr (args, &loc);
/* Set up any additional state. */
if (ret)
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index c9c8328..b27a0be 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -63,20 +63,37 @@ imm_avl_p (machine_mode mode)
{
poly_uint64 nunits = GET_MODE_NUNITS (mode);
+ /* For segmented operations AVL refers to a single register and not all NF
+ registers. Therefore divide the mode size by NF before checking if it is
+ in range. */
+ int nf = 1;
+ if (riscv_v_ext_tuple_mode_p (mode))
+ nf = get_nf (mode);
+
return nunits.is_constant ()
/* The vsetivli can only hold register 0~31. */
- ? (IN_RANGE (nunits.to_constant (), 0, 31))
+ ? (IN_RANGE (nunits.to_constant () / nf, 0, 31))
/* Only allowed in VLS-VLMAX mode. */
: false;
}
-/* Return true if LEN is equal to NUNITS that out of the range [0, 31]. */
+/* Return true if LEN equals the number of units in MODE, where MODE is either
+   a VLA mode or a VLS mode whose size equals the vector size.
+ In that case we can emit a VLMAX insn which can be optimized more easily
+ by the vsetvl pass. */
+
static bool
is_vlmax_len_p (machine_mode mode, rtx len)
{
poly_int64 value;
+ if (poly_int_rtx_p (len, &value)
+ && known_eq (value, GET_MODE_NUNITS (mode))
+ && known_eq (GET_MODE_UNIT_SIZE (mode) * value, BYTES_PER_RISCV_VECTOR))
+ return true;
+
return poly_int_rtx_p (len, &value)
- && known_eq (value, GET_MODE_NUNITS (mode));
+ && !GET_MODE_NUNITS (mode).is_constant ()
+ && known_eq (value, GET_MODE_NUNITS (mode));
}
/* Helper functions for insn_flags && insn_types */
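To make the new VLS condition concrete (assuming the vector length is known to be 128 bits so that BYTES_PER_RISCV_VECTOR is a constant 16; the numbers are for illustration only): V4SImode has 4 units of 4 bytes, so a length operand of 4 passes both known_eq checks and the operation can use the VLMAX form, whereas V2SImode with a length of 2 covers only 8 of the 16 bytes and keeps the non-VLMAX path.  VLA modes are still handled by the second test, which requires a non-constant number of units.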
@@ -954,6 +971,26 @@ emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
}
+/* Function to emit a vslide1up instruction of mode MODE with destination
+ DEST and slideup element ELT. */
+
+rtx
+expand_slide1up (machine_mode mode, rtx dest, rtx elt)
+{
+ unsigned int unspec
+ = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
+ insn_code icode = code_for_pred_slide (unspec, mode);
+ /* RVV Spec 16.3.1
+ The destination vector register group for vslideup cannot overlap the
+ source vector register group, otherwise the instruction encoding
+ is reserved. Thus, we need a new register. */
+ rtx tmp = gen_reg_rtx (mode);
+ rtx ops[] = {tmp, dest, elt};
+ emit_vlmax_insn (icode, BINARY_OP, ops);
+ return tmp;
+}
+
+
/* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
@@ -1175,16 +1212,7 @@ expand_vector_init_trailing_same_elem (rtx target,
{
rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
- {
- unsigned int unspec
- = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
- insn_code icode = code_for_pred_slide (unspec, mode);
- rtx tmp = gen_reg_rtx (mode);
- rtx ops[] = {tmp, dup, builder.elt (i)};
- emit_vlmax_insn (icode, BINARY_OP, ops);
- /* slide1up need source and dest to be different REG. */
- dup = tmp;
- }
+ dup = expand_slide1up (mode, dup, builder.elt (i));
emit_move_insn (target, dup);
return true;
@@ -1717,6 +1745,77 @@ expand_const_vector_stepped (rtx target, rtx src, rvv_builder *builder)
gcc_unreachable ();
}
+/* We don't actually allow this case in legitimate_constant_p but
+ the middle-end still expects us to handle it in an expander
+ (see PR121334). This is assumed to happen very rarely so the
+   implementation is not very efficient, particularly
+   for short vectors.  */
+
+static void
+expand_const_vector_onestep (rtx target, rvv_builder &builder)
+{
+ machine_mode mode = GET_MODE (target);
+ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+ gcc_assert (builder.nelts_per_pattern () == 2);
+
+ /* We have n encoded patterns
+ {csta_0, cstb_0},
+ {csta_1, cstb_1},
+ ...
+ {csta_{n-1}, cstb_{n-1}}
+ which should become one vector:
+ {csta_0, csta_1, ..., csta_{n-1},
+ cstb_0, cstb_1, ..., cstb_{n-1},
+ ...
+ cstb_0, cstb_1, ..., cstb_{n-1}}.
+
+ In order to achieve this we create a permute/gather constant
+ sel = {0, 1, ..., n - 1, 0, 1, ..., n - 1, ...}
+ and two vectors
+ va = {csta_0, csta_1, ..., csta_{n-1}},
+ vb = {cstb_0, cstb_1, ..., cstb_{n-1}}.
+
+ Then we use a VLMAX gather to "broadcast" vb and afterwards
+ overwrite the first n elements with va. */
+
+ int n = builder.npatterns ();
+ /* { 0, 1, 2, ..., n - 1 }. */
+ rtx vid = gen_reg_rtx (mode);
+ expand_vec_series (vid, const0_rtx, const1_rtx);
+
+ /* { 0, 1, ..., n - 1, 0, 1, ..., n - 1, ... }. */
+ rtx sel = gen_reg_rtx (mode);
+  rtx and_ops[] = {sel, vid, GEN_INT (n - 1)};
+ emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP, and_ops);
+
+ /* va = { ELT (0), ELT (1), ... ELT (n - 1) }. */
+ rtx tmp1 = gen_reg_rtx (mode);
+  rtx ops1[] = {tmp1, builder.elt (n - 1)};
+  expand_broadcast (mode, ops1);
+  for (int i = n - 2; i >= 0; i--)
+    tmp1 = expand_slide1up (mode, tmp1, builder.elt (i));
+
+ /* vb = { ELT (n), ELT (n + 1), ... ELT (2 * n - 1) }. */
+ rtx tmp2 = gen_reg_rtx (mode);
+  rtx ops2[] = {tmp2, builder.elt (2 * n - 1)};
+  expand_broadcast (mode, ops2);
+  for (int i = n - 2; i >= 0; i--)
+    tmp2 = expand_slide1up (mode, tmp2, builder.elt (n + i));
+
+ /* Duplicate vb. */
+ rtx tmp3 = gen_reg_rtx (mode);
+ emit_vlmax_gather_insn (tmp3, tmp2, sel);
+
+  /* Overwrite the first n elements with va. */
+ rtx dest = gen_reg_rtx (mode);
+ insn_code icode = code_for_pred_mov (mode);
+ rtx ops3[] = {dest, tmp3, tmp1};
+ emit_nonvlmax_insn (icode, __MASK_OP_TUMA | UNARY_OP_P, ops3, GEN_INT (n));
+
+ emit_move_insn (target, dest);
+}
+
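A small worked example of the scheme above, with values chosen purely for illustration: for the two patterns {1, 3} and {2, 4} (n = 2) the wanted constant is {1, 2, 3, 4, 3, 4, ...}.  vid & (n - 1) gives sel = {0, 1, 0, 1, ...}, va is built as {1, 2, ...} and vb as {3, 4, ...}; the VLMAX gather of vb by sel yields {3, 4, 3, 4, ...}, and the final tail-undisturbed move with AVL n = 2 overwrites the first two elements with va, producing {1, 2, 3, 4, 3, 4, ...} as required.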
static void
expand_const_vector (rtx target, rtx src)
{
@@ -1744,6 +1843,8 @@ expand_const_vector (rtx target, rtx src)
if (CONST_VECTOR_DUPLICATE_P (src))
return expand_const_vector_duplicate (target, &builder);
+ else if (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2)
+ return expand_const_vector_onestep (target, builder);
else if (CONST_VECTOR_STEPPED_P (src))
return expand_const_vector_stepped (target, src, &builder);
@@ -2648,8 +2749,14 @@ expand_vector_init_merge_repeating_sequence (rtx target,
= get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
uint64_t full_nelts = builder.full_nelts ().to_constant ();
+ gcc_assert (builder.nelts_per_pattern () == 1
+ || builder.nelts_per_pattern () == 2);
+
+ rtx first
+ = builder.nelts_per_pattern () == 1 ? builder.elt (0) : builder.elt (1);
+
/* Step 1: Broadcast the first pattern. */
- rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
+ rtx ops[] = {target, force_reg (builder.inner_mode (), first)};
expand_broadcast (builder.mode (), ops);
/* Step 2: Merge the rest iteration of pattern. */
for (unsigned int i = 1; i < builder.npatterns (); i++)
@@ -2677,7 +2784,10 @@ expand_vector_init_merge_repeating_sequence (rtx target,
emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
/* Step 2-2: Merge pattern according to the mask. */
- rtx ops[] = {target, target, builder.elt (i), mask};
+ unsigned int which = i;
+ if (builder.nelts_per_pattern () == 2)
+ which = 2 * which + 1;
+ rtx ops[] = {target, target, builder.elt (which), mask};
emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
MERGE_OP, ops);
}
@@ -3220,15 +3330,17 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
mask_mode = get_mask_mode (data_mode);
rtx mask = gen_reg_rtx (mask_mode);
rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
+ bool overlap = reg_overlap_mentioned_p (target, op1);
+ rtx tmp_target = overlap ? gen_reg_rtx (data_mode) : target;
/* Step 1: generate a mask that should select everything >= nunits into the
* mask. */
expand_vec_cmp (mask, GEU, sel_mod, max_sel);
- /* Step2: gather every op0 values indexed by sel into target,
+ /* Step2: gather every op0 values indexed by sel into TMP_TARGET,
we don't need to care about the result of the element
whose index >= nunits. */
- emit_vlmax_gather_insn (target, op0, sel_mod);
+ emit_vlmax_gather_insn (tmp_target, op0, sel_mod);
/* Step3: shift the range from (nunits, max_of_mode] to
[0, max_of_mode - nunits]. */
@@ -3238,7 +3350,10 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
/* Step4: gather those into the previously masked-out elements
of target. */
- emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
+ emit_vlmax_masked_gather_mu_insn (tmp_target, op1, tmp, mask);
+
+ if (overlap)
+    emit_move_insn (target, tmp_target);
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */
@@ -4078,11 +4193,7 @@ shuffle_off_by_one_patterns (struct expand_vec_perm_d *d)
emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
/* Insert the scalar into element 0. */
- unsigned int unspec
- = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
- insn_code icode = code_for_pred_slide (unspec, d->vmode);
- rtx ops[] = {d->target, d->op1, tmp};
- emit_vlmax_insn (icode, BINARY_OP, ops);
+      emit_move_insn (d->target, expand_slide1up (d->vmode, d->op1, tmp));
}
return true;
@@ -4376,13 +4487,11 @@ expand_strided_load (machine_mode mode, rtx *ops)
int idx = 4;
get_else_operand (ops[idx++]);
rtx len = ops[idx];
- poly_int64 len_val;
insn_code icode = code_for_pred_strided_load (mode);
rtx emit_ops[] = {v_reg, mask, gen_rtx_MEM (mode, base), stride};
- if (poly_int_rtx_p (len, &len_val)
- && known_eq (len_val, GET_MODE_NUNITS (mode)))
+ if (is_vlmax_len_p (mode, len))
emit_vlmax_insn (icode, BINARY_OP_TAMA, emit_ops);
else
{
@@ -4400,11 +4509,9 @@ expand_strided_store (machine_mode mode, rtx *ops)
rtx stride = ops[1];
rtx mask = ops[3];
rtx len = ops[4];
- poly_int64 len_val;
rtx vl_type;
- if (poly_int_rtx_p (len, &len_val)
- && known_eq (len_val, GET_MODE_NUNITS (mode)))
+ if (is_vlmax_len_p (mode, len))
{
len = gen_reg_rtx (Pmode);
emit_vlmax_vsetvl (mode, len);
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index df924fa..5e6cb67 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -275,13 +275,13 @@ loop_invariant_op_p (class loop *loop,
/* Return true if the variable should be counted into liveness. */
static bool
variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info,
- slp_tree node ATTRIBUTE_UNUSED, tree var, bool lhs_p)
+ slp_tree node, tree var, bool lhs_p)
{
if (!var)
return false;
gimple *stmt = STMT_VINFO_STMT (stmt_info);
stmt_info = vect_stmt_to_vectorize (stmt_info);
- enum stmt_vec_info_type type = STMT_VINFO_TYPE (stmt_info);
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (node);
if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
{
if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE
@@ -400,7 +400,7 @@ costs::compute_local_live_ranges (
pair &live_range
= live_ranges->get_or_insert (lhs, &existed_p);
gcc_assert (!existed_p);
- if (STMT_VINFO_MEMORY_ACCESS_TYPE (program_point.stmt_info)
+ if (SLP_TREE_MEMORY_ACCESS_TYPE (*node)
== VMAT_LOAD_STORE_LANES)
point = get_first_lane_point (program_points,
program_point.stmt_info);
@@ -418,8 +418,7 @@ costs::compute_local_live_ranges (
bool existed_p = false;
pair &live_range
= live_ranges->get_or_insert (var, &existed_p);
- if (STMT_VINFO_MEMORY_ACCESS_TYPE (
- program_point.stmt_info)
+ if (SLP_TREE_MEMORY_ACCESS_TYPE (*node)
== VMAT_LOAD_STORE_LANES)
point = get_last_lane_point (program_points,
program_point.stmt_info);
@@ -602,13 +601,13 @@ get_store_value (gimple *stmt)
/* Return true if additional vector vars needed. */
bool
costs::need_additional_vector_vars_p (stmt_vec_info stmt_info,
- slp_tree node ATTRIBUTE_UNUSED)
+ slp_tree node)
{
- enum stmt_vec_info_type type = STMT_VINFO_TYPE (stmt_info);
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (node);
if (type == load_vec_info_type || type == store_vec_info_type)
{
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+ && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))
return true;
machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
@@ -694,7 +693,7 @@ costs::update_local_live_ranges (
if (!node)
continue;
- if (STMT_VINFO_TYPE (stmt_info) == undef_vec_info_type)
+ if (SLP_TREE_TYPE (*node) == undef_vec_info_type)
continue;
for (j = 0; j < gimple_phi_num_args (phi); j++)
@@ -773,7 +772,7 @@ costs::update_local_live_ranges (
slp_tree *node = vinfo_slp_map.get (stmt_info);
if (!node)
continue;
- enum stmt_vec_info_type type = STMT_VINFO_TYPE (stmt_info);
+ enum stmt_vec_info_type type = SLP_TREE_TYPE (*node);
if (need_additional_vector_vars_p (stmt_info, *node))
{
/* For non-adjacent load/store STMT, we will potentially
@@ -1086,7 +1085,7 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
load/store. */
static int
segment_loadstore_group_size (enum vect_cost_for_stmt kind,
- stmt_vec_info stmt_info)
+ stmt_vec_info stmt_info, slp_tree node)
{
if (stmt_info
&& (kind == vector_load || kind == vector_store)
@@ -1094,7 +1093,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
{
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
if (stmt_info
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES)
return DR_GROUP_SIZE (stmt_info);
}
return 0;
@@ -1108,7 +1107,7 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
unsigned
costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
stmt_vec_info stmt_info,
- slp_tree, tree vectype, int stmt_cost)
+ slp_tree node, tree vectype, int stmt_cost)
{
const cpu_vector_cost *costs = get_vector_costs ();
switch (kind)
@@ -1131,7 +1130,8 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
each vector in the group. Here we additionally add permute
costs for each. */
/* TODO: Indexed and ordered/unordered cost. */
- int group_size = segment_loadstore_group_size (kind, stmt_info);
+ int group_size = segment_loadstore_group_size (kind, stmt_info,
+ node);
if (group_size > 1)
{
switch (group_size)
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 0a9fcef..591122f 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -3685,7 +3685,8 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src)
/* This test can fail if (for example) we want a HF and Z[v]fh is
not enabled. In that case we just want to let the standard
expansion path run. */
- if (riscv_vector::get_vector_mode (smode, nunits).exists (&vmode))
+ if (riscv_vector::get_vector_mode (smode, nunits).exists (&vmode)
+ && gen_lowpart_common (vmode, SUBREG_REG (src)))
{
rtx v = gen_lowpart (vmode, SUBREG_REG (src));
rtx int_reg = dest;
@@ -3958,41 +3959,6 @@ riscv_extend_cost (rtx op, bool unsigned_p)
return COSTS_N_INSNS (2);
}
-/* Return the cost of the vector binary rtx like add, minus, mult.
- The cost of scalar2vr_cost will be appended if there one of the
- op comes from the VEC_DUPLICATE. */
-
-static int
-get_vector_binary_rtx_cost (rtx x, int scalar2vr_cost)
-{
- gcc_assert (riscv_v_ext_mode_p (GET_MODE (x)));
-
- rtx neg;
- rtx op_0;
- rtx op_1;
-
- if (GET_CODE (x) == UNSPEC)
- {
- op_0 = XVECEXP (x, 0, 0);
- op_1 = XVECEXP (x, 0, 1);
- }
- else
- {
- op_0 = XEXP (x, 0);
- op_1 = XEXP (x, 1);
- }
-
- if (GET_CODE (op_0) == VEC_DUPLICATE
- || GET_CODE (op_1) == VEC_DUPLICATE)
- return (scalar2vr_cost + 1) * COSTS_N_INSNS (1);
- else if (GET_CODE (neg = op_0) == NEG
- && (GET_CODE (op_1) == VEC_DUPLICATE
- || GET_CODE (XEXP (neg, 0)) == VEC_DUPLICATE))
- return (scalar2vr_cost + 1) * COSTS_N_INSNS (1);
- else
- return COSTS_N_INSNS (1);
-}
-
/* Implement TARGET_RTX_COSTS. */
#define SINGLE_SHIFT_COST 1
@@ -4014,73 +3980,20 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
{
case SET:
{
- switch (GET_CODE (x))
+ if (GET_CODE (x) == VEC_DUPLICATE)
+ *total = (scalar2vr_cost + 1) * COSTS_N_INSNS (1);
+ else
{
- case VEC_DUPLICATE:
- *total = gr2vr_cost * COSTS_N_INSNS (1);
- break;
- case IF_THEN_ELSE:
- {
- rtx op = XEXP (x, 1);
+ int vec_dup_count = 0;
+ subrtx_var_iterator::array_type array;
- switch (GET_CODE (op))
- {
- case DIV:
- case UDIV:
- case MOD:
- case UMOD:
- case US_PLUS:
- case US_MINUS:
- case SS_PLUS:
- case SS_MINUS:
- *total = get_vector_binary_rtx_cost (op, scalar2vr_cost);
- break;
- case UNSPEC:
- {
- switch (XINT (op, 1))
- {
- case UNSPEC_VAADDU:
- case UNSPEC_VAADD:
- *total
- = get_vector_binary_rtx_cost (op, scalar2vr_cost);
- break;
- default:
- *total = COSTS_N_INSNS (1);
- break;
- }
- }
- break;
- default:
- *total = COSTS_N_INSNS (1);
- break;
- }
- }
- break;
- case PLUS:
- case MINUS:
- case AND:
- case IOR:
- case XOR:
- case MULT:
- case SMAX:
- case UMAX:
- case SMIN:
- case UMIN:
- {
- rtx op;
- rtx op_0 = XEXP (x, 0);
- rtx op_1 = XEXP (x, 1);
+ FOR_EACH_SUBRTX_VAR (iter, array, x, ALL)
+ if (GET_CODE (*iter) == VEC_DUPLICATE)
+ vec_dup_count++;
- if (GET_CODE (op = op_0) == MULT
- || GET_CODE (op = op_1) == MULT)
- *total = get_vector_binary_rtx_cost (op, scalar2vr_cost);
- else
- *total = get_vector_binary_rtx_cost (x, scalar2vr_cost);
- }
- break;
- default:
- *total = COSTS_N_INSNS (1);
- break;
+ int total_vec_dup_cost = vec_dup_count * scalar2vr_cost;
+
+ *total = COSTS_N_INSNS (1) * (total_vec_dup_cost + 1);
}
}
break;
@@ -5532,9 +5445,9 @@ canonicalize_comparands (rtx_code code, rtx *op0, rtx *op1)
/* We might have been handed back a SUBREG. Just to make things
easy, force it into a REG. */
- if (!REG_P (*op0) && !CONST_INT_P (*op0))
+ if (!REG_P (*op0) && !CONST_INT_P (*op0) && INTEGRAL_MODE_P (GET_MODE (*op0)))
*op0 = force_reg (word_mode, *op0);
- if (!REG_P (*op1) && !CONST_INT_P (*op1))
+ if (!REG_P (*op1) && !CONST_INT_P (*op1) && INTEGRAL_MODE_P (GET_MODE (*op1)))
*op1 = force_reg (word_mode, *op1);
}
@@ -6213,7 +6126,8 @@ riscv_pass_vls_aggregate_in_gpr (struct riscv_arg_info *info, machine_mode mode,
For a library call, FNTYPE is 0. */
void
-riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype, rtx, tree, int)
+riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, const_tree fntype,
+ rtx, tree, int)
{
memset (cum, 0, sizeof (*cum));
@@ -6494,30 +6408,44 @@ riscv_arg_partial_bytes (cumulative_args_t cum,
return arg.stack_p ? arg.num_gprs * UNITS_PER_WORD : 0;
}
-/* Implement FUNCTION_VALUE and LIBCALL_VALUE. For normal calls,
- VALTYPE is the return type and MODE is VOIDmode. For libcalls,
- VALTYPE is null and MODE is the mode of the return value. */
+/* Implements hook TARGET_FUNCTION_VALUE. */
rtx
-riscv_function_value (const_tree type, const_tree func, machine_mode mode)
+riscv_function_value (const_tree ret_type, const_tree fn_decl_or_type,
+ bool)
{
struct riscv_arg_info info;
CUMULATIVE_ARGS args;
- if (type)
+ if (fn_decl_or_type)
{
- int unsigned_p = TYPE_UNSIGNED (type);
+ const_tree fntype = TREE_CODE (fn_decl_or_type) == FUNCTION_DECL ?
+ TREE_TYPE (fn_decl_or_type) : fn_decl_or_type;
+ riscv_init_cumulative_args (&args, fntype, NULL_RTX, NULL_TREE, 0);
+ }
+ else
+ memset (&args, 0, sizeof args);
- mode = TYPE_MODE (type);
+ int unsigned_p = TYPE_UNSIGNED (ret_type);
- /* Since TARGET_PROMOTE_FUNCTION_MODE unconditionally promotes,
- return values, promote the mode here too. */
- mode = promote_function_mode (type, mode, &unsigned_p, func, 1);
- }
+ machine_mode mode = TYPE_MODE (ret_type);
- memset (&args, 0, sizeof args);
+  /* Since TARGET_PROMOTE_FUNCTION_MODE unconditionally promotes return
+     values, promote the mode here too. */
+ mode = promote_function_mode (ret_type, mode, &unsigned_p, fn_decl_or_type, 1);
- return riscv_get_arg_info (&info, &args, mode, type, true, true);
+ return riscv_get_arg_info (&info, &args, mode, ret_type, true, true);
+}
+
+/* Implements hook TARGET_LIBCALL_VALUE. */
+
+rtx
+riscv_libcall_value (machine_mode mode, const_rtx fun ATTRIBUTE_UNUSED)
+{
+ struct riscv_arg_info info;
+ CUMULATIVE_ARGS args;
+ memset (&args, 0, sizeof args);
+ return riscv_get_arg_info (&info, &args, mode, NULL_TREE, true, true);
}
/* Implement TARGET_PASS_BY_REFERENCE. */
@@ -13867,9 +13795,9 @@ riscv_expand_xmode_usmul (rtx dest, rtx x, rtx y)
riscv_emit_binary (MULT, mul, x, y);
if (TARGET_64BIT)
- emit_insn (gen_usmuldi3_highpart (mulhu, x, y));
+ emit_insn (gen_umuldi3_highpart (mulhu, x, y));
else
- emit_insn (gen_usmulsi3_highpart (mulhu, x, y));
+ emit_insn (gen_umulsi3_highpart (mulhu, x, y));
riscv_emit_binary (NE, overflow_p, mulhu, CONST0_RTX (Xmode));
riscv_emit_unary (NEG, overflow_p, overflow_p);
@@ -14037,10 +13965,13 @@ riscv_c_mode_for_floating_type (enum tree_index ti)
return default_mode_for_floating_type (ti);
}
-/* This parses the attribute arguments to target_version in DECL and modifies
- the feature mask and priority required to select those targets. */
+/* Parse the attribute arguments to target_version in DECL and modify
+ the feature mask and priority required to select those targets.
+ If LOC is nonnull, report diagnostics against *LOC, otherwise
+ remain silent. */
static void
parse_features_for_version (tree decl,
+ location_t *loc,
struct riscv_feature_bits &res,
int &priority)
{
@@ -14071,14 +14002,12 @@ parse_features_for_version (tree decl,
cl_target_option_restore (&global_options, &global_options_set,
default_opts);
- riscv_process_target_version_attr (TREE_VALUE (version_attr),
- DECL_SOURCE_LOCATION (decl));
+ riscv_process_target_version_attr (TREE_VALUE (version_attr), loc);
priority = global_options.x_riscv_fmv_priority;
const char *arch_string = global_options.x_riscv_arch_string;
bool parse_res
- = riscv_minimal_hwprobe_feature_bits (arch_string, &res,
- DECL_SOURCE_LOCATION (decl));
+ = riscv_minimal_hwprobe_feature_bits (arch_string, &res, loc);
gcc_assert (parse_res);
cl_target_option_restore (&global_options, &global_options_set,
@@ -14135,8 +14064,8 @@ riscv_compare_version_priority (tree decl1, tree decl2)
struct riscv_feature_bits mask1, mask2;
int prio1, prio2;
- parse_features_for_version (decl1, mask1, prio1);
- parse_features_for_version (decl2, mask2, prio2);
+ parse_features_for_version (decl1, nullptr, mask1, prio1);
+ parse_features_for_version (decl2, nullptr, mask2, prio2);
return compare_fmv_features (mask1, mask2, prio1, prio2);
}
@@ -14439,6 +14368,7 @@ dispatch_function_versions (tree dispatch_decl,
version_info.version_decl = version_decl;
// Get attribute string, parse it and find the right features.
parse_features_for_version (version_decl,
+ &DECL_SOURCE_LOCATION (version_decl),
version_info.features,
version_info.prio);
function_versions.push_back (version_info);
@@ -15441,6 +15371,217 @@ synthesize_and (rtx operands[3])
return true;
}
+/* Synthesize OPERANDS[0] = OPERANDS[1] + OPERANDS[2].
+
+ OPERANDS[0] and OPERANDS[1] will be a REG and may be the same
+ REG.
+
+ OPERANDS[2] is a CONST_INT.
+
+ Return TRUE if the operation was fully synthesized and the caller
+ need not generate additional code. Return FALSE if the operation
+ was not synthesized and the caller is responsible for emitting the
+ proper sequence. */
+
+bool
+synthesize_add (rtx operands[3])
+{
+ /* Trivial cases that don't need synthesis. */
+ if (SMALL_OPERAND (INTVAL (operands[2])))
+ return false;
+
+ int budget1 = riscv_const_insns (operands[2], true);
+ int budget2 = riscv_const_insns (GEN_INT (-INTVAL (operands[2])), true);
+
+ HOST_WIDE_INT ival = INTVAL (operands[2]);
+
+ /* If we can emit two addi insns then that's better than synthesizing
+ the constant into a temporary, then adding the temporary to the
+ other input. The exception is when the constant can be loaded
+     in a single instruction which can issue whenever it's convenient.  */
+ if (SUM_OF_TWO_S12 (ival) && budget1 >= 2)
+ {
+ HOST_WIDE_INT saturated = HOST_WIDE_INT_M1U << (IMM_BITS - 1);
+
+ if (ival >= 0)
+ saturated = ~saturated;
+
+ ival -= saturated;
+
+ rtx x = gen_rtx_PLUS (word_mode, operands[1], GEN_INT (saturated));
+ emit_insn (gen_rtx_SET (operands[0], x));
+ rtx output = gen_rtx_PLUS (word_mode, operands[0], GEN_INT (ival));
+ emit_insn (gen_rtx_SET (operands[0], output));
+ return true;
+ }
+
+ /* If we can shift the constant by 1, 2, or 3 bit positions
+ and the result is a cheaper constant, then do so. */
+ ival = INTVAL (operands[2]);
+ if (TARGET_ZBA
+ && (((ival % 2) == 0 && budget1
+ > riscv_const_insns (GEN_INT (ival >> 1), true))
+ || ((ival % 4) == 0 && budget1
+ > riscv_const_insns (GEN_INT (ival >> 2), true))
+ || ((ival % 8) == 0 && budget1
+ > riscv_const_insns (GEN_INT (ival >> 3), true))))
+ {
+      /* Load the shifted constant into a temporary.  */
+ int shct = ctz_hwi (ival);
+
+ /* We can handle shifting up to 3 bit positions via shNadd. */
+ if (shct > 3)
+ shct = 3;
+
+ /* The adjusted constant may still need synthesis, so do not copy
+	 it directly into a register.  Let the expander handle it.  */
+ rtx tmp = force_reg (word_mode, GEN_INT (ival >> shct));
+
+ /* Generate shift-add of temporary and operands[1]
+ into the final destination. */
+ rtx x = gen_rtx_ASHIFT (word_mode, tmp, GEN_INT (shct));
+ rtx output = gen_rtx_PLUS (word_mode, x, operands[1]);
+ emit_insn (gen_rtx_SET (operands[0], output));
+ return true;
+ }
+
+ /* If the negated constant is cheaper than the original, then negate
+ the constant and use sub. */
+ if (budget2 < budget1)
+ {
+      /* Load -INTVAL (operands[2]) into a temporary.  */
+ rtx tmp = force_reg (word_mode, GEN_INT (-INTVAL (operands[2])));
+
+      /* Subtract the negated constant from operands[1].  */
+ rtx output = gen_rtx_MINUS (word_mode, operands[1], tmp);
+ emit_insn (gen_rtx_SET (operands[0], output));
+ return true;
+ }
+
+ /* No add synthesis was found. Synthesize the constant into
+ a temporary and use that. */
+ rtx x = force_reg (word_mode, operands[2]);
+ x = gen_rtx_PLUS (word_mode, operands[1], x);
+ emit_insn (gen_rtx_SET (operands[0], x));
+ return true;
+}
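A concrete example of the SUM_OF_TWO_S12 branch above (the register names are made up): for operands[2] == 3000, which does not fit in a signed 12-bit immediate, saturated becomes 2047 and the remainder 953, so the expansion is roughly

    addi	a0,a1,2047
    addi	a0,a0,953

and no constant needs to be synthesized into a temporary.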
+
+/* Synthesize OPERANDS[0] = OPERANDS[1] + OPERANDS[2].
+
+   This variant handles 32-bit additions on a 64-bit target.
+
+ OPERANDS[0] and OPERANDS[1] will be a REG and may be the same
+ REG.
+
+ OPERANDS[2] is a CONST_INT.
+
+ Return TRUE if the operation was fully synthesized and the caller
+ need not generate additional code. Return FALSE if the operation
+ was not synthesized and the caller is responsible for emitting the
+ proper sequence. */
+
+
+bool
+synthesize_add_extended (rtx operands[3])
+{
+  /* If operands[2] is a 12-bit signed immediate,
+     no synthesis needs to be done.  */
+ if (SMALL_OPERAND (INTVAL (operands[2])))
+ return false;
+
+ HOST_WIDE_INT ival = INTVAL (operands[2]);
+ int budget1 = riscv_const_insns (operands[2], true);
+ int budget2 = riscv_const_insns (GEN_INT (-INTVAL (operands[2])), true);
+
+  /* If operands[2] can be split into two 12-bit signed immediates,
+     split the add into two adds.  */
+ if (SUM_OF_TWO_S12 (ival))
+ {
+ HOST_WIDE_INT saturated = HOST_WIDE_INT_M1U << (IMM_BITS - 1);
+
+ if (ival >= 0)
+ saturated = ~saturated;
+
+ ival -= saturated;
+
+ rtx temp = gen_reg_rtx (DImode);
+ emit_insn (gen_addsi3_extended (temp, operands[1], GEN_INT (saturated)));
+ temp = gen_lowpart (SImode, temp);
+ SUBREG_PROMOTED_VAR_P (temp) = 1;
+ SUBREG_PROMOTED_SET (temp, SRP_SIGNED);
+ emit_insn (gen_rtx_SET (operands[0], temp));
+ rtx t = gen_reg_rtx (DImode);
+ emit_insn (gen_addsi3_extended (t, operands[0], GEN_INT (ival)));
+ t = gen_lowpart (SImode, t);
+ SUBREG_PROMOTED_VAR_P (t) = 1;
+ SUBREG_PROMOTED_SET (t, SRP_SIGNED);
+ emit_move_insn (operands[0], t);
+ return true;
+ }
+
+  /* If the negated value is cheaper to synthesize, subtract that from
+     operands[1].  */
+ if (budget2 < budget1)
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+ emit_insn (gen_rtx_SET (tmp, GEN_INT (-INTVAL (operands[2]))));
+
+ rtx t = gen_reg_rtx (DImode);
+ emit_insn (gen_subsi3_extended (t, operands[1], tmp));
+ t = gen_lowpart (SImode, t);
+ SUBREG_PROMOTED_VAR_P (t) = 1;
+ SUBREG_PROMOTED_SET (t, SRP_SIGNED);
+ emit_move_insn (operands[0], t);
+ return true;
+ }
+
+ rtx tsrc = force_reg (SImode, operands[2]);
+ rtx tdest = gen_reg_rtx (DImode);
+ emit_insn (gen_addsi3_extended (tdest, operands[1], tsrc));
+ tdest = gen_lowpart (SImode, tdest);
+ SUBREG_PROMOTED_VAR_P (tdest) = 1;
+ SUBREG_PROMOTED_SET (tdest, SRP_SIGNED);
+ emit_move_insn (operands[0], tdest);
+ return true;
+}
+
+/* HINT specifies the target cache.
+
+   TODO: LOCALITY is currently unused.
+
+   Return the first operand of the associated PREF or PREFX insn.  */
+rtx
+riscv_prefetch_cookie (rtx hint, rtx locality)
+{
+ return (GEN_INT (INTVAL (hint)
+ + CacheHint::DCACHE_HINT + INTVAL (locality) * 0));
+}
+
+/* Return true if X is a legitimate address with offset for prefetch.
+ MODE is the mode of the value being accessed. */
+bool
+riscv_prefetch_offset_address_p (rtx x, machine_mode mode)
+{
+ struct riscv_address_info addr;
+
+ if (riscv_classify_address (&addr, x, mode, false)
+ && addr.type == ADDRESS_REG)
+ {
+ if (TARGET_XMIPSCBOP)
+ return (CONST_INT_P (addr.offset)
+ && MIPS_RISCV_9BIT_OFFSET_P (INTVAL (addr.offset)));
+ }
+
+ return true;
+}
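A quick note on how the cookie is consumed (the operand value follows from the CacheHint definitions added to riscv.h below; this is my reading of the code, not a documented encoding): a read prefetch has operands[1] == 0, so riscv_prefetch_cookie returns 0 + DCACHE_HINT == 8 and the prefetch pattern later in this patch emits mips.pref with 8 as its first operand; the locality argument is deliberately ignored for now (multiplied by 0).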
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
@@ -15804,6 +15945,12 @@ synthesize_and (rtx operands[3])
#undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
#define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P riscv_vector_mode_supported_any_target_p
+#undef TARGET_FUNCTION_VALUE
+#define TARGET_FUNCTION_VALUE riscv_function_value
+
+#undef TARGET_LIBCALL_VALUE
+#define TARGET_LIBCALL_VALUE riscv_libcall_value
+
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P riscv_function_value_regno_p
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 45fa521..9146571 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -71,7 +71,7 @@ extern const char *riscv_arch_help (int argc, const char **argv);
{"tune", "%{!mtune=*:" \
" %{!mcpu=*:-mtune=%(VALUE)}" \
" %{mcpu=*:-mtune=%:riscv_default_mtune(%* %(VALUE))}}" }, \
- {"arch", "%{!march=*:" \
+ {"arch", "%{!march=*|march=unset:" \
" %{!mcpu=*:-march=%(VALUE)}" \
" %{mcpu=*:%:riscv_expand_arch_from_cpu(%* %(VALUE))}}" }, \
{"abi", "%{!mabi=*:-mabi=%(VALUE)}" }, \
@@ -111,13 +111,19 @@ extern const char *riscv_arch_help (int argc, const char **argv);
%(subtarget_asm_spec)" \
ASM_MISA_SPEC
+/* Drop all -march=* options before -march=unset. */
+#define ARCH_UNSET_CLEANUP_SPECS \
+ "%{march=unset:%<march=*} " \
+
#undef DRIVER_SELF_SPECS
#define DRIVER_SELF_SPECS \
+ARCH_UNSET_CLEANUP_SPECS \
"%{march=help:%:riscv_arch_help()} " \
"%{print-supported-extensions:%:riscv_arch_help()} " \
"%{-print-supported-extensions:%:riscv_arch_help()} " \
"%{march=*:%:riscv_expand_arch(%*)} " \
-"%{!march=*:%{mcpu=*:%:riscv_expand_arch_from_cpu(%*)}} "
+"%{!march=*|march=unset:%{mcpu=*:%:riscv_expand_arch_from_cpu(%*)}} " \
+"%{march=unset:%{!mcpu=*:%eAt least one valid -mcpu option must be given after -march=unset}} "
#define LOCAL_LABEL_PREFIX "."
#define USER_LABEL_PREFIX ""
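The intended use of -march=unset is easiest to see on an illustrative command line:

    riscv64-unknown-linux-gnu-gcc -march=rv64imac -march=unset -mcpu=xt-c910 ...

Per the ARCH_UNSET_CLEANUP_SPECS comment, the -march options given before -march=unset are dropped, the architecture is then derived from -mcpu via riscv_expand_arch_from_cpu, and passing -march=unset without any -mcpu hits the new %e error above.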
@@ -759,12 +765,6 @@ enum reg_class
#define CALLEE_SAVED_FREG_NUMBER(REGNO) CALLEE_SAVED_REG_NUMBER (REGNO - 32)
-#define LIBCALL_VALUE(MODE) \
- riscv_function_value (NULL_TREE, NULL_TREE, MODE)
-
-#define FUNCTION_VALUE(VALTYPE, FUNC) \
- riscv_function_value (VALTYPE, FUNC, VOIDmode)
-
/* 1 if N is a possible register number for function argument passing.
We have no FP argument registers when soft-float. */
@@ -1319,4 +1319,15 @@ extern void riscv_remove_unneeded_save_restore_calls (void);
#define TARGET_HAS_FMV_TARGET_ATTRIBUTE 0
+/* mips pref valid offset range. */
+#define MIPS_RISCV_9BIT_OFFSET_P(OFFSET) (IN_RANGE (OFFSET, 0, 511))
+
+/* mips pref cache hint type. */
+typedef enum {
+ ICACHE_HINT = 0 << 3,
+ DCACHE_HINT = 1 << 3,
+ SCACHE_HINT = 2 << 3,
+ TCACHE_HINT = 3 << 3
+} CacheHint;
+
#endif /* ! GCC_RISCV_H */
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 578dd43..d34405c 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -712,24 +712,45 @@
(set_attr "mode" "SI")])
(define_expand "addsi3"
- [(set (match_operand:SI 0 "register_operand" "=r,r")
- (plus:SI (match_operand:SI 1 "register_operand" " r,r")
- (match_operand:SI 2 "arith_operand" " r,I")))]
+ [(set (match_operand:SI 0 "register_operand")
+ (plus:SI (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "reg_or_const_int_operand")))]
""
{
+  /* We may be able to find a faster sequence; if so, we are done.
+     Otherwise let expansion continue normally.  */
+ if (CONST_INT_P (operands[2])
+ && ((!TARGET_64BIT && synthesize_add (operands))
+ || (TARGET_64BIT && synthesize_add_extended (operands))))
+ DONE;
+
+  /* Constants have already been handled.  */
if (TARGET_64BIT)
{
- rtx t = gen_reg_rtx (DImode);
- emit_insn (gen_addsi3_extended (t, operands[1], operands[2]));
- t = gen_lowpart (SImode, t);
- SUBREG_PROMOTED_VAR_P (t) = 1;
- SUBREG_PROMOTED_SET (t, SRP_SIGNED);
- emit_move_insn (operands[0], t);
+ rtx tdest = gen_reg_rtx (DImode);
+ emit_insn (gen_addsi3_extended (tdest, operands[1], operands[2]));
+ tdest = gen_lowpart (SImode, tdest);
+ SUBREG_PROMOTED_VAR_P (tdest) = 1;
+ SUBREG_PROMOTED_SET (tdest, SRP_SIGNED);
+ emit_move_insn (operands[0], tdest);
DONE;
}
+
})
-(define_insn "adddi3"
+(define_expand "adddi3"
+ [(set (match_operand:DI 0 "register_operand")
+ (plus:DI (match_operand:DI 1 "register_operand")
+ (match_operand:DI 2 "reg_or_const_int_operand")))]
+ "TARGET_64BIT"
+{
+  /* We may be able to find a faster sequence; if so, we are done.
+     Otherwise let expansion continue normally.  */
+ if (CONST_INT_P (operands[2]) && synthesize_add (operands))
+ DONE;
+})
+
+(define_insn "*adddi3"
[(set (match_operand:DI 0 "register_operand" "=r,r")
(plus:DI (match_operand:DI 1 "register_operand" " r,r")
(match_operand:DI 2 "arith_operand" " r,I")))]
@@ -2293,12 +2314,16 @@
rtx abs_reg = gen_reg_rtx (<ANYF:MODE>mode);
rtx coeff_reg = gen_reg_rtx (<ANYF:MODE>mode);
rtx tmp_reg = gen_reg_rtx (<ANYF:MODE>mode);
+ rtx fflags = gen_reg_rtx (SImode);
riscv_emit_move (tmp_reg, operands[1]);
riscv_emit_move (coeff_reg,
riscv_vector::get_fp_rounding_coefficient (<ANYF:MODE>mode));
emit_insn (gen_abs<ANYF:mode>2 (abs_reg, operands[1]));
+  /* The fp compare can set the invalid flag for NaN, so back up fflags.  */
+ if (flag_trapping_math)
+ emit_insn (gen_riscv_frflags (fflags));
riscv_expand_conditional_branch (label, LT, abs_reg, coeff_reg);
emit_jump_insn (gen_jump (end_label));
@@ -2324,6 +2349,14 @@
emit_insn (gen_copysign<ANYF:mode>3 (tmp_reg, abs_reg, operands[1]));
emit_label (end_label);
+
+  /* Restore fflags, but after the label.  This is slightly different
+     from the glibc implementation, which only needs to restore under
+     the label since it checks for NaN first, meaning the following fp
+     compare can't raise fp exceptions and thus can't clobber fflags.  */
+ if (flag_trapping_math)
+ emit_insn (gen_riscv_fsflags (fflags));
+
riscv_emit_move (operands[0], tmp_reg);
}
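In terms of the emitted sequence this brackets the comparison with a save/restore pair (a sketch of the intent, not literal output): gen_riscv_frflags copies fflags into a scratch GPR before the abs/coefficient compare, and gen_riscv_fsflags writes it back after end_label, so a NaN input cannot leave the invalid flag set when -ftrapping-math is in effect.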
@@ -4402,11 +4435,21 @@
)
(define_insn "prefetch"
- [(prefetch (match_operand 0 "prefetch_operand" "Qr")
- (match_operand 1 "imm5_operand" "i")
- (match_operand 2 "const_int_operand" "n"))]
- "TARGET_ZICBOP"
+ [(prefetch (match_operand 0 "prefetch_operand" "Qr,ZD")
+ (match_operand 1 "imm5_operand" "i,i")
+ (match_operand 2 "const_int_operand" "n,n"))]
+ "TARGET_ZICBOP || TARGET_XMIPSCBOP"
{
+ if (TARGET_XMIPSCBOP)
+ {
+      /* MIPS prefetch write is a nop for the p8700.  */
+ if (operands[1] != CONST0_RTX (GET_MODE (operands[1])))
+ return "nop";
+
+ operands[1] = riscv_prefetch_cookie (operands[1], operands[2]);
+ return "mips.pref\t%1,%a0";
+ }
+
switch (INTVAL (operands[1]))
{
case 0:
diff --git a/gcc/config/riscv/sifive-p400.md b/gcc/config/riscv/sifive-p400.md
index ed8b8ec..0acdbda 100644
--- a/gcc/config/riscv/sifive-p400.md
+++ b/gcc/config/riscv/sifive-p400.md
@@ -153,10 +153,13 @@
(eq_attr "type" "fmove,fcvt"))
"p400_float_pipe,sifive_p400_fpu")
+;; We need something for HF so that we don't abort during
+;; scheduling if someone asks for p400 scheduling but
+;; enables the various HF mode extensions.
(define_insn_reservation "sifive_p400_fdiv_s" 18
(and (eq_attr "tune" "sifive_p400")
(eq_attr "type" "fdiv,fsqrt")
- (eq_attr "mode" "SF"))
+ (eq_attr "mode" "HF,SF"))
"sifive_p400_FM, sifive_p400_fdiv*5")
(define_insn_reservation "sifive_p400_fdiv_d" 31
@@ -178,3 +181,18 @@
(define_bypass 1 "sifive_p400_f2i"
"sifive_p400_branch,sifive_p400_sfb_alu,sifive_p400_mul,
sifive_p400_div,sifive_p400_alu,sifive_p400_cpop")
+
+
+;; Someone familiar with the p400 uarch needs to put
+;; these into the right reservations. This is just a placeholder
+;; for everything I found that had no mapping to a reservation.
+;;
+;; Note that even if the processor does not implement a particular
+;; instruction, it should still have suitable reservations, even if
+;; they are just dummies like this one.
+(define_insn_reservation "sifive_p400_unknown" 1
+ (and (eq_attr "tune" "sifive_p400")
+ (eq_attr "type" "ghost,vfrecp,vclmul,vldm,vmffs,vclmulh,vlsegde,vfcvtitof,vsm4k,vfcvtftoi,vfdiv,vsm3c,vsm4r,viwmuladd,vfwredu,vcpop,vfwmuladd,vstux,vsshift,vfwcvtftof,vfncvtftof,vfwmaccbf16,vext,vssegte,rdvl,vaeskf1,vfslide1up,vmov,vimovvx,vaesef,vfsqrt,viminmax,vfwcvtftoi,vssegtox,vfclass,viwmul,vector,vgmul,vsm3me,vfcmp,vstm,vfredo,vfwmul,vaeskf2,vstox,vfncvtbf16,vislide1up,vgather,vldox,viwred,vctz,vghsh,vsts,vslidedown,vfmerge,vicmp,vsmul,vlsegdff,vfalu,vfmov,vislide1down,vfminmax,vcompress,vldr,vldff,vlsegdux,vimuladd,vsalu,vidiv,sf_vqmacc,vfslide1down,vaesem,vimerge,vfncvtftoi,vfwcvtitof,vicalu,vaesz,sf_vc_se,vsha2cl,vmsfs,vldux,vmidx,vslideup,vired,vlde,vfwredo,vfmovfv,vbrev,vfncvtitof,rdfrm,vsetvl,vssegts,vimul,vialu,vbrev8,vfwalu,rdvlenb,sf_vfnrclip,vclz,vnclip,sf_vc,vimov,vste,vfmuladd,vfmovvf,vwsll,vsetvl_pre,vlds,vlsegds,vmiota,vmalu,wrvxrm,wrfrm,viwalu,vaesdm,vssegtux,vaesdf,vimovxv,vror,vnshift,vstr,vaalu,vsha2ms,crypto,vfwcvtbf16,vlsegdox,vrol,vandn,vfsgnj,vmpop,vfredu,vsha2ch,vshift,vrev8,vfmul"))
+ "p400_int_pipe+sifive_p400_ialu")
+
+
diff --git a/gcc/config/riscv/sifive-p600.md b/gcc/config/riscv/sifive-p600.md
index 2401349..ccd006d 100644
--- a/gcc/config/riscv/sifive-p600.md
+++ b/gcc/config/riscv/sifive-p600.md
@@ -157,10 +157,13 @@
(eq_attr "type" "fmove,fcvt"))
"float_pipe,sifive_p600_fpu")
+;; We need something for HF so that we don't abort during
+;; scheduling if someone asks for p600 scheduling but
+;; enables the various HF mode extensions.
(define_insn_reservation "sifive_p600_fdiv_s" 11
(and (eq_attr "tune" "sifive_p600")
(eq_attr "type" "fdiv,fsqrt")
- (eq_attr "mode" "SF"))
+ (eq_attr "mode" "HF,SF"))
"sifive_p600_FM, sifive_p600_fdiv*5")
(define_insn_reservation "sifive_p600_fdiv_d" 19
@@ -182,3 +185,15 @@
(define_bypass 1 "sifive_p600_f2i"
"sifive_p600_branch,sifive_p600_sfb_alu,sifive_p600_mul,
sifive_p600_div,sifive_p600_alu,sifive_p600_cpop")
+
+;; Someone familiar with the p600 uarch needs to put
+;; these into the right reservations. This is just a placeholder
+;; for everything I found that had no mapping to a reservation.
+;;
+;; Note that even if the processor does not implement a particular
+;; instruction, it should still have suitable reservations, even if
+;; they are just dummies like this one.
+(define_insn_reservation "sifive_p600_unknown" 1
+ (and (eq_attr "tune" "sifive_p600")
+ (eq_attr "type" "vicmp,vssegte,vbrev8,vfwalu,vimov,vmpop,vaesdf,vislide1up,vror,vsha2cl,vrol,vslideup,vimuladd,vclmul,vaesef,vext,vlsegdff,vfmuladd,vfclass,vmsfs,vfcmp,vsmul,vsm3me,vmalu,vshift,viwmuladd,vfslide1up,vlsegde,vsm4k,wrvxrm,vislide1down,vsm3c,vfwmuladd,vaesdm,vclmulh,vfwcvtftof,vfwredu,vfredo,sf_vfnrclip,vaesz,vwsll,vmiota,vctz,vsetvl_pre,vstm,vidiv,vssegtux,vfwmul,vcompress,vste,vired,vlsegds,vaesem,vfminmax,ghost,vandn,crypto,vfmul,vialu,vfmovvf,rdfrm,vldff,vfmerge,vsshift,vnclip,sf_vqmacc,vnshift,vfdiv,vfslide1down,vfncvtitof,vfsqrt,vimovxv,vstr,vfwcvtbf16,vfwcvtitof,vbrev,vssegtox,vssegts,vcpop,vmffs,viwmul,vldr,vmidx,rdvlenb,vfalu,vslidedown,vlde,vfsgnj,vfmov,viwalu,vsha2ch,vfncvtbf16,vfcvtitof,rdvl,vsetvl,vsha2ms,vector,vstux,vimerge,vclz,sf_vc,vfcvtftoi,viminmax,vsm4r,sf_vc_se,wrfrm,vstox,vfmovfv,vfncvtftoi,vimul,vsalu,vmov,vgmul,vgather,vldux,vlsegdox,vfncvtftof,vimovvx,vghsh,vldm,vldox,vfwcvtftoi,vlds,vfrecp,vaeskf2,vsts,vfredu,vicalu,vaalu,vfwmaccbf16,vrev8,vfwredo,vlsegdux,viwred,vaeskf1"))
+ "int_pipe+sifive_p600_ialu")
diff --git a/gcc/config/riscv/sync.md b/gcc/config/riscv/sync.md
index 50ec8b3..e47bb41 100644
--- a/gcc/config/riscv/sync.md
+++ b/gcc/config/riscv/sync.md
@@ -386,13 +386,13 @@
})
(define_insn "amo_atomic_exchange<mode>"
- [(set (match_operand:GPR 0 "register_operand" "=&r")
+ [(set (match_operand:GPR 0 "register_operand" "=r")
(unspec_volatile:GPR
[(match_operand:GPR 1 "memory_operand" "+A")
(match_operand:SI 3 "const_int_operand")] ;; model
UNSPEC_SYNC_EXCHANGE))
(set (match_dup 1)
- (match_operand:GPR 2 "register_operand" "0"))]
+ (match_operand:GPR 2 "reg_or_0_operand" "rJ"))]
"TARGET_ZAAMO"
"amoswap.<amo>%A3\t%0,%z2,%1"
[(set_attr "type" "atomic")
@@ -434,13 +434,13 @@
})
(define_insn "zabha_atomic_exchange<mode>"
- [(set (match_operand:SHORT 0 "register_operand" "=&r")
+ [(set (match_operand:SHORT 0 "register_operand" "=r")
(unspec_volatile:SHORT
[(match_operand:SHORT 1 "memory_operand" "+A")
(match_operand:SI 3 "const_int_operand")] ;; model
UNSPEC_SYNC_EXCHANGE_ZABHA))
(set (match_dup 1)
- (match_operand:SHORT 2 "register_operand" "0"))]
+ (match_operand:SHORT 2 "reg_or_0_operand" "rJ"))]
"TARGET_ZABHA"
"amoswap.<amobh>%A3\t%0,%z2,%1"
[(set_attr "type" "atomic")
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 7aac56a..a7eaa8b 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -229,8 +229,41 @@ s-riscv-ext.texi: build/gen-riscv-ext-texi$(build_exeext)
$(SHELL) $(srcdir)/../move-if-change tmp-riscv-ext.texi $(srcdir)/doc/riscv-ext.texi
$(STAMP) s-riscv-ext.texi
-# Run `riscv-regen' after you changed or added anything from riscv-ext*.def
+RISCV_CORES_DEFS = \
+ $(srcdir)/config/riscv/riscv-cores.def
+
+build/gen-riscv-mtune-texi.o: $(srcdir)/config/riscv/gen-riscv-mtune-texi.cc \
+ $(RISCV_CORES_DEFS)
+ $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mcpu-texi.o: $(srcdir)/config/riscv/gen-riscv-mcpu-texi.cc \
+ $(RISCV_CORES_DEFS)
+ $(CXX_FOR_BUILD) $(CXXFLAGS_FOR_BUILD) -c $< -o $@
+
+build/gen-riscv-mtune-texi$(build_exeext): build/gen-riscv-mtune-texi.o
+ $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+build/gen-riscv-mcpu-texi$(build_exeext): build/gen-riscv-mcpu-texi.o
+ $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ $<
+
+$(srcdir)/doc/riscv-mtune.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mtune.texi: s-riscv-mtune.texi ; @true
+
+$(srcdir)/doc/riscv-mcpu.texi: $(RISCV_CORES_DEFS)
+$(srcdir)/doc/riscv-mcpu.texi: s-riscv-mcpu.texi ; @true
+
+s-riscv-mtune.texi: build/gen-riscv-mtune-texi$(build_exeext)
+ $(RUN_GEN) build/gen-riscv-mtune-texi$(build_exeext) > tmp-riscv-mtune.texi
+ $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mtune.texi $(srcdir)/doc/riscv-mtune.texi
+ $(STAMP) s-riscv-mtune.texi
+
+s-riscv-mcpu.texi: build/gen-riscv-mcpu-texi$(build_exeext)
+ $(RUN_GEN) build/gen-riscv-mcpu-texi$(build_exeext) > tmp-riscv-mcpu.texi
+ $(SHELL) $(srcdir)/../move-if-change tmp-riscv-mcpu.texi $(srcdir)/doc/riscv-mcpu.texi
+ $(STAMP) s-riscv-mcpu.texi
+
+# Run `riscv-regen' after you change or add anything in riscv-ext*.def or riscv-cores*.def
.PHONY: riscv-regen
-riscv-regen: s-riscv-ext.texi s-riscv-ext.opt
+riscv-regen: s-riscv-ext.texi s-riscv-ext.opt s-riscv-mtune.texi s-riscv-mcpu.texi
diff --git a/gcc/config/riscv/t-rtems b/gcc/config/riscv/t-rtems
index f596e76..a4d2d03 100644
--- a/gcc/config/riscv/t-rtems
+++ b/gcc/config/riscv/t-rtems
@@ -1,8 +1,8 @@
MULTILIB_OPTIONS =
MULTILIB_DIRNAMES =
-MULTILIB_OPTIONS += march=rv32i/march=rv32iac/march=rv32im/march=rv32imf/march=rv32ima/march=rv32imac/march=rv32imaf/march=rv32imafc/march=rv32imafd/march=rv32imafdc/march=rv64ima/march=rv64imac/march=rv64imafd/march=rv64imafdc
-MULTILIB_DIRNAMES += rv32i rv32iac rv32im rv32imf rv32ima rv32imac rv32imaf rv32imafc rv32imafd rv32imafdc rv64ima rv64imac rv64imafd rv64imafdc
+MULTILIB_OPTIONS += march=rv32i/march=rv32iac/march=rv32im/march=rv32imf/march=rv32ima/march=rv32imac/march=rv32imaf/march=rv32imafc/march=rv32imafd/march=rv32imafdc/march=rv64ima/march=rv64imac/march=rv64imafd/march=rv64imafdc/march=rv64imc
+MULTILIB_DIRNAMES += rv32i rv32iac rv32im rv32imf rv32ima rv32imac rv32imaf rv32imafc rv32imafd rv32imafdc rv64ima rv64imac rv64imafd rv64imafdc rv64imc
MULTILIB_OPTIONS += mabi=ilp32/mabi=ilp32f/mabi=ilp32d/mabi=lp64/mabi=lp64d
MULTILIB_DIRNAMES += ilp32 ilp32f ilp32d lp64 lp64d
@@ -10,6 +10,9 @@ MULTILIB_DIRNAMES += ilp32 ilp32f ilp32d lp64 lp64d
MULTILIB_OPTIONS += mcmodel=medany
MULTILIB_DIRNAMES += medany
+MULTILIB_OPTIONS += mstrict-align
+MULTILIB_DIRNAMES += strict-align
+
MULTILIB_REQUIRED =
MULTILIB_REQUIRED += march=rv32i/mabi=ilp32
MULTILIB_REQUIRED += march=rv32iac/mabi=ilp32
@@ -25,3 +28,5 @@ MULTILIB_REQUIRED += march=rv64ima/mabi=lp64/mcmodel=medany
MULTILIB_REQUIRED += march=rv64imac/mabi=lp64/mcmodel=medany
MULTILIB_REQUIRED += march=rv64imafd/mabi=lp64d/mcmodel=medany
MULTILIB_REQUIRED += march=rv64imafdc/mabi=lp64d/mcmodel=medany
+MULTILIB_REQUIRED += march=rv64imafdc/mabi=lp64d/mcmodel=medany/mstrict-align
+MULTILIB_REQUIRED += march=rv64imc/mabi=lp64/mcmodel=medany/mstrict-align
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 66b7670..2b35d66 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1398,6 +1398,7 @@
}
[(set_attr "type" "vmov,vlde,vste")
(set_attr "mode" "<VT:MODE>")
+ (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE))
(set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE))
(set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))])
@@ -1435,6 +1436,7 @@
}
[(set_attr "type" "vlde,vste,vmov")
(set_attr "mode" "<MODE>")
+ (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE))
(set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE))
(set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))]
)
@@ -1485,6 +1487,7 @@
}
[(set_attr "type" "vlde,vste,vmov")
(set_attr "mode" "<VLS_AVL_REG:MODE>")
+ (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE))
(set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE))
(set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))]
)
@@ -5490,6 +5493,98 @@
"TARGET_VECTOR"
{})
+(define_expand "@pred_mul_plus_vx_<mode>"
+ [(set (match_operand:V_VLSI_QHS 0 "register_operand")
+ (if_then_else:V_VLSI_QHS
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand")
+ (match_operand 6 "vector_length_operand")
+ (match_operand 7 "const_int_operand")
+ (match_operand 8 "const_int_operand")
+ (match_operand 9 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (plus:V_VLSI_QHS
+ (mult:V_VLSI_QHS
+ (vec_duplicate:V_VLSI_QHS
+ (match_operand:<VEL> 2 "register_operand"))
+ (match_operand:V_VLSI_QHS 3 "register_operand"))
+ (match_operand:V_VLSI_QHS 4 "register_operand"))
+ (match_operand:V_VLSI_QHS 5 "vector_merge_operand")))]
+ "TARGET_VECTOR"
+{
+ riscv_vector::prepare_ternary_operands (operands);
+})
+
+(define_expand "@pred_mul_plus_vx_<mode>"
+ [(set (match_operand:V_VLSI_D 0 "register_operand")
+ (if_then_else:V_VLSI_D
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand")
+ (match_operand 6 "vector_length_operand")
+ (match_operand 7 "const_int_operand")
+ (match_operand 8 "const_int_operand")
+ (match_operand 9 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (plus:V_VLSI_D
+ (mult:V_VLSI_D
+ (vec_duplicate:V_VLSI_D
+ (match_operand:<VEL> 2 "register_operand"))
+ (match_operand:V_VLSI_D 3 "register_operand"))
+ (match_operand:V_VLSI_D 4 "register_operand"))
+ (match_operand:V_VLSI_D 5 "vector_merge_operand")))]
+ "TARGET_VECTOR && TARGET_64BIT"
+{
+ riscv_vector::prepare_ternary_operands (operands);
+})
+
+(define_expand "@pred_vnmsac_vx_<mode>"
+ [(set (match_operand:V_VLSI_QHS 0 "register_operand")
+ (if_then_else:V_VLSI_QHS
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand")
+ (match_operand 6 "vector_length_operand")
+ (match_operand 7 "const_int_operand")
+ (match_operand 8 "const_int_operand")
+ (match_operand 9 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (minus:V_VLSI_QHS
+ (match_operand:V_VLSI_QHS 4 "register_operand")
+ (mult:V_VLSI_QHS
+ (vec_duplicate:V_VLSI_QHS
+ (match_operand:<VEL> 2 "register_operand"))
+ (match_operand:V_VLSI_QHS 3 "register_operand")))
+ (match_operand:V_VLSI_QHS 5 "vector_merge_operand")))]
+ "TARGET_VECTOR"
+{
+ riscv_vector::prepare_ternary_operands (operands);
+})
+
+(define_expand "@pred_vnmsac_vx_<mode>"
+ [(set (match_operand:V_VLSI_D 0 "register_operand")
+ (if_then_else:V_VLSI_D
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand")
+ (match_operand 6 "vector_length_operand")
+ (match_operand 7 "const_int_operand")
+ (match_operand 8 "const_int_operand")
+ (match_operand 9 "const_int_operand")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (minus:V_VLSI_D
+ (match_operand:V_VLSI_D 4 "register_operand")
+ (mult:V_VLSI_D
+ (vec_duplicate:V_VLSI_D
+ (match_operand:<VEL> 2 "register_operand"))
+ (match_operand:V_VLSI_D 3 "register_operand")))
+ (match_operand:V_VLSI_D 5 "vector_merge_operand")))]
+ "TARGET_VECTOR && TARGET_64BIT"
+{
+ riscv_vector::prepare_ternary_operands (operands);
+})
+
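The expanders added above feed the vmacc.vx / vnmsac.vx scalar multiply-add and multiply-subtract forms. As a rough, hedged illustration (not part of the patch), these are the loop shapes that could map onto the new patterns once the multiplier is recognized as a loop-invariant scalar:

/* Hedged sketch: loops that could use @pred_mul_plus_vx (vmacc.vx) and
   @pred_vnmsac_vx (vnmsac.vx) when auto-vectorized for RVV.  */
void
mul_add_scalar (int *__restrict d, const int *__restrict a,
		const int *__restrict b, int s, int n)
{
  for (int i = 0; i < n; i++)
    d[i] = a[i] + s * b[i];	/* candidate for vmacc.vx  */
}

void
mul_sub_scalar (int *__restrict d, const int *__restrict a,
		const int *__restrict b, int s, int n)
{
  for (int i = 0; i < n; i++)
    d[i] = a[i] - s * b[i];	/* candidate for vnmsac.vx  */
}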
(define_insn "*pred_madd<mode>_scalar"
[(set (match_operand:V_VLSI 0 "register_operand" "=vd, vr")
(if_then_else:V_VLSI
@@ -6324,8 +6419,8 @@
(set_attr "mode" "<MODE>")])
(define_insn "@pred_<optab><mode>_scalar"
- [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr")
- (if_then_else:VF
+ [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr")
+ (if_then_else:V_VLSF
(unspec:<VM>
[(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
(match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl")
@@ -6336,11 +6431,11 @@
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)
(reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE)
- (commutative_float_binop:VF
- (vec_duplicate:VF
+ (commutative_float_binop:V_VLSF
+ (vec_duplicate:V_VLSF
(match_operand:<VEL> 4 "register_operand" " f, f, f, f"))
- (match_operand:VF 3 "register_operand" " vr, vr, vr, vr"))
- (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr"))
+ (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"vf<insn>.vf\t%0,%3,%4%p1"
[(set_attr "type" "<float_insn_type>")
@@ -6349,43 +6444,43 @@
(symbol_ref "riscv_vector::get_frm_mode (operands[9])"))])
(define_insn "@pred_<optab><mode>_scalar"
- [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr")
- (if_then_else:VF
+ [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr")
+ (if_then_else:V_VLSF
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
- (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl")
- (match_operand 6 "const_int_operand" " i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i")
- (match_operand 8 "const_int_operand" " i, i, i, i")
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
+ (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
+ (match_operand 8 "const_int_operand" " i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (commutative_float_binop_nofrm:VF
- (vec_duplicate:VF
- (match_operand:<VEL> 4 "register_operand" " f, f, f, f"))
- (match_operand:VF 3 "register_operand" " vr, vr, vr, vr"))
- (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (commutative_float_binop_nofrm:V_VLSF
+ (vec_duplicate:V_VLSF
+ (match_operand:<VEL> 4 "register_operand" " f, f, f, f"))
+ (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr"))
+ (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"vf<insn>.vf\t%0,%3,%4%p1"
[(set_attr "type" "<float_insn_type>")
(set_attr "mode" "<MODE>")])
(define_insn "@pred_<ieee_fmaxmin_op><mode>_scalar"
- [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr")
- (if_then_else:VF
+ [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr")
+ (if_then_else:V_VLSF
(unspec:<VM>
- [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
- (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl")
- (match_operand 6 "const_int_operand" " i, i, i, i")
- (match_operand 7 "const_int_operand" " i, i, i, i")
- (match_operand 8 "const_int_operand" " i, i, i, i")
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
+ (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl")
+ (match_operand 6 "const_int_operand" " i, i, i, i")
+ (match_operand 7 "const_int_operand" " i, i, i, i")
+ (match_operand 8 "const_int_operand" " i, i, i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (unspec:VF
- [(match_operand:VF 3 "register_operand" " vr, vr, vr, vr")
- (vec_duplicate:VF
+ (unspec:V_VLSF
+ [(match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr")
+ (vec_duplicate:V_VLSF
(match_operand:<VEL> 4 "register_operand" " f, f, f, f"))]
UNSPEC_VFMAXMIN)
- (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"v<ieee_fmaxmin_op>.vf\t%0,%3,%4%p1"
[(set_attr "type" "vfminmax")
@@ -6417,8 +6512,8 @@
(symbol_ref "riscv_vector::get_frm_mode (operands[9])"))])
(define_insn "@pred_<optab><mode>_reverse_scalar"
- [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr")
- (if_then_else:VF
+ [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr")
+ (if_then_else:V_VLSF
(unspec:<VM>
[(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1")
(match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl")
@@ -6429,11 +6524,11 @@
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)
(reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE)
- (non_commutative_float_binop:VF
- (vec_duplicate:VF
+ (non_commutative_float_binop:V_VLSF
+ (vec_duplicate:V_VLSF
(match_operand:<VEL> 4 "register_operand" " f, f, f, f"))
- (match_operand:VF 3 "register_operand" " vr, vr, vr, vr"))
- (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
+ (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr"))
+ (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))]
"TARGET_VECTOR"
"vfr<insn>.vf\t%0,%3,%4%p1"
[(set_attr "type" "<float_insn_type>")
@@ -8839,6 +8934,106 @@
[(set_attr "type" "vssegt<order>x")
(set_attr "mode" "<V32T:MODE>")])
+(define_insn "*pred_macc_<mode>_scalar_undef"
+ [(set (match_operand:V_VLSI_QHS 0 "register_operand" "=vd, vr")
+ (if_then_else:V_VLSI_QHS
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1")
+ (match_operand 6 "vector_length_operand" "rvl, rvl")
+ (match_operand 7 "const_int_operand" " i, i")
+ (match_operand 8 "const_int_operand" " i, i")
+ (match_operand 9 "const_int_operand" " i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (plus:V_VLSI_QHS
+ (mult:V_VLSI_QHS
+ (vec_duplicate:V_VLSI_QHS
+ (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ"))
+ (match_operand:V_VLSI_QHS 4 "register_operand" " vr, vr"))
+ (match_operand:V_VLSI_QHS 5 "register_operand" " 0, 0"))
+ (match_operand:V_VLSI_QHS 2 "vector_undef_operand")))]
+ "TARGET_VECTOR"
+ "@
+ vmacc.vx\t%0,%z3,%4%p1
+ vmacc.vx\t%0,%z3,%4%p1"
+ [(set_attr "type" "vimuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "*pred_macc_<mode>_scalar_undef"
+ [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr")
+ (if_then_else:V_VLSI_D
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1")
+ (match_operand 6 "vector_length_operand" "rvl, rvl")
+ (match_operand 7 "const_int_operand" " i, i")
+ (match_operand 8 "const_int_operand" " i, i")
+ (match_operand 9 "const_int_operand" " i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (plus:V_VLSI_D
+ (mult:V_VLSI_D
+ (vec_duplicate:V_VLSI_D
+ (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ"))
+ (match_operand:V_VLSI_D 4 "register_operand" " vr, vr"))
+ (match_operand:V_VLSI_D 5 "register_operand" " 0, 0"))
+ (match_operand:V_VLSI_D 2 "vector_undef_operand")))]
+ "TARGET_VECTOR && TARGET_64BIT"
+ "@
+ vmacc.vx\t%0,%z3,%4%p1
+ vmacc.vx\t%0,%z3,%4%p1"
+ [(set_attr "type" "vimuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "*pred_nmsac_<mode>_scalar_undef"
+ [(set (match_operand:V_VLSI_QHS 0 "register_operand" "=vd, vr")
+ (if_then_else:V_VLSI_QHS
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1")
+ (match_operand 6 "vector_length_operand" "rvl, rvl")
+ (match_operand 7 "const_int_operand" " i, i")
+ (match_operand 8 "const_int_operand" " i, i")
+ (match_operand 9 "const_int_operand" " i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (minus:V_VLSI_QHS
+ (match_operand:V_VLSI_QHS 5 "register_operand" " 0, 0")
+ (mult:V_VLSI_QHS
+ (vec_duplicate:V_VLSI_QHS
+ (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ"))
+ (match_operand:V_VLSI_QHS 4 "register_operand" " vr, vr")))
+ (match_operand:V_VLSI_QHS 2 "vector_undef_operand")))]
+ "TARGET_VECTOR"
+ "@
+ vnmsac.vx\t%0,%z3,%4%p1
+ vnmsac.vx\t%0,%z3,%4%p1"
+ [(set_attr "type" "vimuladd")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "*pred_nmsac_<mode>_scalar_undef"
+ [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr")
+ (if_then_else:V_VLSI_D
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1")
+ (match_operand 6 "vector_length_operand" "rvl, rvl")
+ (match_operand 7 "const_int_operand" " i, i")
+ (match_operand 8 "const_int_operand" " i, i")
+ (match_operand 9 "const_int_operand" " i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (minus:V_VLSI_D
+ (match_operand:V_VLSI_D 5 "register_operand" " 0, 0")
+ (mult:V_VLSI_D
+ (vec_duplicate:V_VLSI_D
+ (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ"))
+ (match_operand:V_VLSI_D 4 "register_operand" " vr, vr")))
+ (match_operand:V_VLSI_D 2 "vector_undef_operand")))]
+ "TARGET_VECTOR && TARGET_64BIT"
+ "@
+ vnmsac.vx\t%0,%z3,%4%p1
+ vnmsac.vx\t%0,%z3,%4%p1"
+ [(set_attr "type" "vimuladd")
+ (set_attr "mode" "<MODE>")])
+
(include "autovec.md")
(include "autovec-opt.md")
(include "sifive-vector.md")
diff --git a/gcc/config/riscv/xiangshan.md b/gcc/config/riscv/xiangshan.md
index 34b4a8f..6179140 100644
--- a/gcc/config/riscv/xiangshan.md
+++ b/gcc/config/riscv/xiangshan.md
@@ -144,13 +144,13 @@
(define_insn_reservation "xiangshan_sfdiv" 11
(and (eq_attr "tune" "xiangshan")
(eq_attr "type" "fdiv")
- (eq_attr "mode" "SF"))
+ (eq_attr "mode" "HF,SF"))
"xs_fmisc_rs")
(define_insn_reservation "xiangshan_sfsqrt" 17
(and (eq_attr "tune" "xiangshan")
(eq_attr "type" "fsqrt")
- (eq_attr "mode" "SF"))
+ (eq_attr "mode" "HF,SF"))
"xs_fmisc_rs")
(define_insn_reservation "xiangshan_dfdiv" 21
diff --git a/gcc/config/rl78/rl78.opt.urls b/gcc/config/rl78/rl78.opt.urls
index 96eff5f..66e874b 100644
--- a/gcc/config/rl78/rl78.opt.urls
+++ b/gcc/config/rl78/rl78.opt.urls
@@ -4,7 +4,7 @@ msim
UrlSuffix(gcc/RL78-Options.html#index-msim-6)
mmul=
-UrlSuffix(gcc/RL78-Options.html#index-mmul)
+UrlSuffix(gcc/RL78-Options.html#index-mmul-1)
mallregs
UrlSuffix(gcc/RL78-Options.html#index-mallregs)
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 16227e5..8dd23f8 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -5174,6 +5174,7 @@ public:
protected:
void update_target_cost_per_stmt (vect_cost_for_stmt, stmt_vec_info,
+ slp_tree node,
vect_cost_model_location, unsigned int);
void density_test (loop_vec_info);
void adjust_vect_cost_per_loop (loop_vec_info);
@@ -5321,6 +5322,7 @@ rs6000_adjust_vect_cost_per_stmt (enum vect_cost_for_stmt kind,
void
rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
stmt_vec_info stmt_info,
+ slp_tree node,
vect_cost_model_location where,
unsigned int orig_count)
{
@@ -5381,12 +5383,12 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
or may not need to apply. When finalizing the cost of the loop,
the extra penalty is applied when the load density heuristics
are satisfied. */
- if (kind == vec_construct && stmt_info
- && STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- && (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_STRIDED_SLP))
+ if (kind == vec_construct && node
+ && SLP_TREE_TYPE (node) == load_vec_info_type
+ && (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP))
{
- tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ tree vectype = SLP_TREE_VECTYPE (node);
unsigned int nunits = vect_nunits_for_cost (vectype);
/* As PR103702 shows, it's possible that vectorizer wants to do
costings for only one unit here, it's no need to do any
@@ -5415,7 +5417,7 @@ rs6000_cost_data::update_target_cost_per_stmt (vect_cost_for_stmt kind,
unsigned
rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind,
- stmt_vec_info stmt_info, slp_tree,
+ stmt_vec_info stmt_info, slp_tree node,
tree vectype, int misalign,
vect_cost_model_location where)
{
@@ -5433,7 +5435,7 @@ rs6000_cost_data::add_stmt_cost (int count, vect_cost_for_stmt kind,
retval = adjust_cost_for_freq (stmt_info, where, count * stmt_cost);
m_costs[where] += retval;
- update_target_cost_per_stmt (kind, stmt_info, where, orig_count);
+ update_target_cost_per_stmt (kind, stmt_info, node, where, orig_count);
}
return retval;
@@ -10318,15 +10320,18 @@ can_be_rotated_to_negative_lis (HOST_WIDE_INT c, int *rot)
/* case b. xx0..01..1xx: some of 15 x's (and some of 16 0's) are
rotated over the highest bit. */
- int pos_one = clz_hwi ((c << 16) >> 16);
- middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one));
- int middle_ones = clz_hwi (~(c << pos_one));
- if (middle_zeros >= 16 && middle_ones >= 33)
+ unsigned HOST_WIDE_INT uc = c;
+ int pos_one = clz_hwi ((HOST_WIDE_INT) (uc << 16) >> 16);
+ if (pos_one > 0 && pos_one < HOST_BITS_PER_WIDE_INT)
{
- *rot = pos_one;
- return true;
+ middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one));
+ int middle_ones = clz_hwi (~(uc << pos_one));
+ if (middle_zeros >= 16 && middle_ones >= 33)
+ {
+ *rot = pos_one;
+ return true;
+ }
}
-
return false;
}
@@ -10443,7 +10448,8 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask)
if (lz >= HOST_BITS_PER_WIDE_INT)
return false;
- int middle_ones = clz_hwi (~(c << lz));
+ unsigned HOST_WIDE_INT uc = c;
+ int middle_ones = clz_hwi (~(uc << lz));
if (tz + lz + middle_ones >= ones
&& (tz - lz) < HOST_BITS_PER_WIDE_INT
&& tz < HOST_BITS_PER_WIDE_INT)
@@ -10477,7 +10483,7 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask)
if (!IN_RANGE (pos_first_1, 1, HOST_BITS_PER_WIDE_INT-1))
return false;
- middle_ones = clz_hwi (~c << pos_first_1);
+ middle_ones = clz_hwi ((~(unsigned HOST_WIDE_INT) c) << pos_first_1);
middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_first_1));
if (pos_first_1 < HOST_BITS_PER_WIDE_INT
&& middle_ones + middle_zeros < HOST_BITS_PER_WIDE_INT
@@ -10579,7 +10585,8 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c, int *num_insns)
{
/* li/lis; rldicX */
unsigned HOST_WIDE_INT imm = (c | ~mask);
- imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
+ if (shift > 0 && shift < HOST_BITS_PER_WIDE_INT)
+ imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
count_or_emit_insn (temp, GEN_INT (imm));
if (shift != 0)
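The arithmetic changes in rs6000.cc above harden shift handling for HOST_WIDE_INT constants: negative operands are copied into unsigned variables before left shifts, and shift counts are range-checked against the type width. A minimal stand-alone sketch (illustrative only, not part of the patch) of the two hazards being avoided:

/* Hedged illustration of why the patch shifts through an unsigned copy
   and range-checks the shift count.  */
#include <cstdio>

int
main ()
{
  long long c = -42;			/* hypothetical constant, sign bit set  */
  unsigned long long uc = c;		/* well-defined modular conversion  */
  unsigned long long hi = uc << 16;	/* OK: unsigned left shift  */
  /* (c << 16) is undefined behaviour before C++20 because c is negative,
     and a shift by a count >= the type width (e.g. uc << 64) is always
     undefined, hence the added checks on pos_one and shift.  */
  std::printf ("%#llx\n", hi);
  return 0;
}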
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 9c718ca..04a6c0f 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1969,7 +1969,7 @@
[(set (match_dup 0) (plus:GPR (match_dup 1) (match_dup 3)))
(set (match_dup 0) (plus:GPR (match_dup 0) (match_dup 4)))]
{
- HOST_WIDE_INT val = INTVAL (operands[2]);
+ unsigned HOST_WIDE_INT val = UINTVAL (operands[2]);
HOST_WIDE_INT low = sext_hwi (val, 16);
HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
@@ -15665,10 +15665,10 @@
(if_then_else:SI (lt (match_dup 3)
(const_int 0))
(const_int -1)
- (if_then_else (gt (match_dup 3)
- (const_int 0))
- (const_int 1)
- (const_int 0))))]
+ (if_then_else:SI (gt (match_dup 3)
+ (const_int 0))
+ (const_int 1)
+ (const_int 0))))]
"TARGET_P9_MISC"
{
operands[3] = gen_reg_rtx (CCmode);
@@ -15703,10 +15703,10 @@
(if_then_else:SI (lt (match_operand:CC 1 "cc_reg_operand" "y")
(const_int 0))
(const_int -1)
- (if_then_else (gt (match_dup 1)
- (const_int 0))
- (const_int 1)
- (const_int 0))))]
+ (if_then_else:SI (gt (match_dup 1)
+ (const_int 0))
+ (const_int 1)
+ (const_int 0))))]
"TARGET_P9_MISC"
"setb %0,%1"
[(set_attr "type" "logical")])
@@ -15716,10 +15716,10 @@
(if_then_else:SI (ltu (match_operand:CCUNS 1 "cc_reg_operand" "y")
(const_int 0))
(const_int -1)
- (if_then_else (gtu (match_dup 1)
- (const_int 0))
- (const_int 1)
- (const_int 0))))]
+ (if_then_else:SI (gtu (match_dup 1)
+ (const_int 0))
+ (const_int 1)
+ (const_int 0))))]
"TARGET_P9_MISC"
"setb %0,%1"
[(set_attr "type" "logical")])
@@ -15751,10 +15751,10 @@
(if_then_else:SI (lt (match_dup 3)
(const_int 0))
(const_int -1)
- (if_then_else (gt (match_dup 3)
- (const_int 0))
- (const_int 1)
- (const_int 0))))]
+ (if_then_else:SI (gt (match_dup 3)
+ (const_int 0))
+ (const_int 1)
+ (const_int 0))))]
"TARGET_P9_MISC"
{
operands[3] = gen_reg_rtx (CCmode);
@@ -15807,10 +15807,10 @@
(if_then_else:SI (lt (match_dup 3)
(const_int 0))
(const_int -1)
- (if_then_else (gt (match_dup 3)
- (const_int 0))
- (const_int 1)
- (const_int 0))))]
+ (if_then_else:SI (gt (match_dup 3)
+ (const_int 0))
+ (const_int 1)
+ (const_int 0))))]
"TARGET_P9_MISC && TARGET_64BIT"
{
operands[3] = gen_reg_rtx (CCmode);
diff --git a/gcc/config/rx/rx.cc b/gcc/config/rx/rx.cc
index dd730dc..c563881 100644
--- a/gcc/config/rx/rx.cc
+++ b/gcc/config/rx/rx.cc
@@ -1648,16 +1648,20 @@ mark_frame_related (rtx insn)
static void
add_pop_cfi_notes (rtx_insn *insn, unsigned int high, unsigned int low)
{
- rtx t = plus_constant (Pmode, stack_pointer_rtx,
- (high - low + 1) * UNITS_PER_WORD);
+ rtx src = stack_pointer_rtx;
+ rtx t;
+ for (unsigned int i = low; i <= high; i++)
+ {
+ add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (word_mode, i));
+ if (i == FRAME_POINTER_REGNUM && frame_pointer_needed)
+ src = frame_pointer_rtx;
+ }
+ t = plus_constant (Pmode, src, (high - low + 1) * UNITS_PER_WORD);
t = gen_rtx_SET (stack_pointer_rtx, t);
add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
RTX_FRAME_RELATED_P (insn) = 1;
- for (unsigned int i = low; i <= high; i++)
- add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (word_mode, i));
}
-
static bool
ok_for_max_constant (HOST_WIDE_INT val)
{
@@ -1816,36 +1820,17 @@ rx_expand_prologue (void)
}
}
- /* If needed, set up the frame pointer. */
- if (frame_pointer_needed)
- gen_safe_add (frame_pointer_rtx, stack_pointer_rtx,
- GEN_INT (- (HOST_WIDE_INT) frame_size), true);
-
- /* Allocate space for the outgoing args.
- If the stack frame has not already been set up then handle this as well. */
- if (stack_size)
+ if (stack_size || frame_size)
{
- if (frame_size)
- {
- if (frame_pointer_needed)
- gen_safe_add (stack_pointer_rtx, frame_pointer_rtx,
- GEN_INT (- (HOST_WIDE_INT) stack_size), true);
- else
- gen_safe_add (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (- (HOST_WIDE_INT) (frame_size + stack_size)),
- true);
- }
- else
- gen_safe_add (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (- (HOST_WIDE_INT) stack_size), true);
+ gen_safe_add (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (- (HOST_WIDE_INT) (stack_size + frame_size)),
+ true);
}
- else if (frame_size)
+ if (frame_pointer_needed)
{
- if (! frame_pointer_needed)
- gen_safe_add (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (- (HOST_WIDE_INT) frame_size), true);
- else
- gen_safe_add (stack_pointer_rtx, frame_pointer_rtx, NULL_RTX, true);
+ gen_safe_add (frame_pointer_rtx, stack_pointer_rtx,
+ GEN_INT ((HOST_WIDE_INT) stack_size),
+ true);
}
}
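The reworked rx_expand_prologue now performs a single stack adjustment of frame_size + stack_size and then derives the frame pointer from the new stack pointer. A small worked example with illustrative values (not taken from the patch) shows that this lands on the same addresses as the old two-step sequence:

/* Hedged sketch of the resulting layout with frame_size = 32 and
   stack_size = 16.  */
#include <cstdio>

int
main ()
{
  unsigned long sp = 0x1000;		/* hypothetical incoming stack pointer  */
  unsigned long frame_size = 32;	/* saved registers and locals  */
  unsigned long stack_size = 16;	/* outgoing arguments area  */
  sp -= stack_size + frame_size;	/* one combined allocation: sp = 0xfd0  */
  unsigned long fp = sp + stack_size;	/* frame pointer above the args: 0xfe0  */
  std::printf ("sp=%#lx fp=%#lx\n", sp, fp);
  return 0;
}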
diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index d760a7e..6becad1 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -128,6 +128,8 @@ extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx);
extern void s390_expand_vec_init (rtx, rtx);
extern rtx s390_expand_merge_perm_const (machine_mode, bool);
extern void s390_expand_merge (rtx, rtx, rtx, bool);
+extern void s390_expand_int_spaceship (rtx, rtx, rtx, rtx);
+extern void s390_expand_fp_spaceship (rtx, rtx, rtx, rtx);
extern rtx s390_build_signbit_mask (machine_mode);
extern rtx s390_return_addr_rtx (int, rtx);
extern rtx s390_back_chain_rtx (void);
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index abe551c..1a47f47 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -8213,6 +8213,167 @@ s390_expand_atomic (machine_mode mode, enum rtx_code code,
NULL_RTX, 1, OPTAB_DIRECT), 1);
}
+/* Expand integer op0 = op1 <=> op2, i.e.,
+ op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : 1.
+
+ Signedness is specified by op3. If op3 equals 1, then perform an unsigned
+ comparison, and if op3 equals -1, then perform a signed comparison.
+
+ For integer comparisons we strive for a sequence like
+ CR[L] ; LHI ; LOCHIL ; LOCHIH
+ where the first three instructions fit into a group. */
+
+void
+s390_expand_int_spaceship (rtx op0, rtx op1, rtx op2, rtx op3)
+{
+ gcc_assert (op3 == const1_rtx || op3 == constm1_rtx);
+
+ rtx cc, cond_lt, cond_gt;
+ machine_mode cc_mode;
+ machine_mode mode = GET_MODE (op1);
+
+  /* Prior to VXE3, emulate a 128-bit comparison by breaking it up into three
+     comparisons.  First test the high halves.  If they are equal, then test
+     the low halves.  Finally, test for equality.  Depending on the results
+     make use of LOCs.  */
+ if (mode == TImode && !TARGET_VXE3)
+ {
+ gcc_assert (TARGET_VX);
+ op1
+ = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op1, TImode, 0));
+ op2
+ = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op2, TImode, 0));
+ rtx lab = gen_label_rtx ();
+ rtx ccz = gen_rtx_REG (CCZmode, CC_REGNUM);
+      /* Compare high halves for equality.
+ VEC[L]G op1, op2 sets
+ CC1 if high(op1) < high(op2)
+ and
+ CC2 if high(op1) > high(op2). */
+ machine_mode cc_mode = op3 == const1_rtx ? CCUmode : CCSmode;
+ rtx lane0 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+ emit_insn (gen_rtx_SET (
+ gen_rtx_REG (cc_mode, CC_REGNUM),
+ gen_rtx_COMPARE (cc_mode,
+ gen_rtx_VEC_SELECT (DImode, op1, lane0),
+ gen_rtx_VEC_SELECT (DImode, op2, lane0))));
+ s390_emit_jump (lab, gen_rtx_NE (CCZmode, ccz, const0_rtx));
+      /* At this point we know that the high halves are equal.
+	 VCHLGS op2, op1 sets CC1 if low(op1) < low(op2).  */
+ emit_insn (gen_rtx_PARALLEL (
+ VOIDmode,
+ gen_rtvec (2,
+ gen_rtx_SET (gen_rtx_REG (CCVIHUmode, CC_REGNUM),
+ gen_rtx_COMPARE (CCVIHUmode, op2, op1)),
+ gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (V2DImode)))));
+ emit_label (lab);
+ emit_insn (gen_rtx_SET (op0, const1_rtx));
+ emit_insn (
+ gen_movsicc (op0,
+ gen_rtx_LTU (CCUmode, gen_rtx_REG (CCUmode, CC_REGNUM),
+ const0_rtx),
+ constm1_rtx, op0));
+      /* Deal with the case where both halves are equal.  */
+ emit_insn (gen_rtx_PARALLEL (
+ VOIDmode,
+ gen_rtvec (2,
+ gen_rtx_SET (gen_rtx_REG (CCVEQmode, CC_REGNUM),
+ gen_rtx_COMPARE (CCVEQmode, op1, op2)),
+ gen_rtx_SET (gen_reg_rtx (V2DImode),
+ gen_rtx_EQ (V2DImode, op1, op2)))));
+ emit_insn (gen_movsicc (op0, gen_rtx_EQ (CCZmode, ccz, const0_rtx),
+ const0_rtx, op0));
+ return;
+ }
+
+ if (mode == QImode || mode == HImode)
+ {
+ rtx_code extend = op3 == const1_rtx ? ZERO_EXTEND : SIGN_EXTEND;
+ op1 = simplify_gen_unary (extend, SImode, op1, mode);
+ op1 = force_reg (SImode, op1);
+ op2 = simplify_gen_unary (extend, SImode, op2, mode);
+ op2 = force_reg (SImode, op2);
+ mode = SImode;
+ }
+
+ if (op3 == const1_rtx)
+ {
+ cc_mode = CCUmode;
+ cc = gen_rtx_REG (cc_mode, CC_REGNUM);
+ cond_lt = gen_rtx_LTU (mode, cc, const0_rtx);
+ cond_gt = gen_rtx_GTU (mode, cc, const0_rtx);
+ }
+ else
+ {
+ cc_mode = CCSmode;
+ cc = gen_rtx_REG (cc_mode, CC_REGNUM);
+ cond_lt = gen_rtx_LT (mode, cc, const0_rtx);
+ cond_gt = gen_rtx_GT (mode, cc, const0_rtx);
+ }
+
+ emit_insn (gen_rtx_SET (cc, gen_rtx_COMPARE (cc_mode, op1, op2)));
+ emit_move_insn (op0, const0_rtx);
+ emit_insn (gen_movsicc (op0, cond_lt, constm1_rtx, op0));
+ emit_insn (gen_movsicc (op0, cond_gt, const1_rtx, op0));
+}
+
+/* Expand floating-point op0 = op1 <=> op2, i.e.,
+ op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : -128.
+
+ If op3 equals const0_rtx, then we are interested in the compare only (see
+   test spaceship-fp-4.c).  Otherwise, op3 is a CONST_INT different from
+   const1_rtx and constm1_rtx, which is used to set op0 for unordered.
+
+ Emit a branch-only solution, i.e., let if-convert fold the branches into
+ LOCs if applicable. This has the benefit that the solution is also
+ applicable if we are only interested in the compare, i.e., if op3 equals
+ const0_rtx.
+ */
+
+void
+s390_expand_fp_spaceship (rtx op0, rtx op1, rtx op2, rtx op3)
+{
+ gcc_assert (op3 != const1_rtx && op3 != constm1_rtx);
+
+ machine_mode mode = GET_MODE (op1);
+ machine_mode cc_mode = s390_select_ccmode (LTGT, op1, op2);
+ rtx cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+ rtx cond_unordered = gen_rtx_UNORDERED (mode, cc_reg, const0_rtx);
+ rtx cond_eq = gen_rtx_EQ (mode, cc_reg, const0_rtx);
+ rtx cond_gt = gen_rtx_GT (mode, cc_reg, const0_rtx);
+ rtx_insn *insn;
+ rtx l_unordered = gen_label_rtx ();
+ rtx l_eq = gen_label_rtx ();
+ rtx l_gt = gen_label_rtx ();
+ rtx l_end = gen_label_rtx ();
+
+ s390_emit_compare (VOIDmode, LTGT, op1, op2);
+ if (!flag_finite_math_only)
+ {
+ insn = s390_emit_jump (l_unordered, cond_unordered);
+ add_reg_br_prob_note (insn, profile_probability::very_unlikely ());
+ }
+ insn = s390_emit_jump (l_eq, cond_eq);
+ add_reg_br_prob_note (insn, profile_probability::unlikely ());
+ insn = s390_emit_jump (l_gt, cond_gt);
+ add_reg_br_prob_note (insn, profile_probability::even ());
+ emit_move_insn (op0, constm1_rtx);
+ emit_jump (l_end);
+ emit_label (l_eq);
+ emit_move_insn (op0, const0_rtx);
+ emit_jump (l_end);
+ emit_label (l_gt);
+ emit_move_insn (op0, const1_rtx);
+ if (!flag_finite_math_only)
+ {
+ emit_jump (l_end);
+ emit_label (l_unordered);
+ rtx unord_val = op3 == const0_rtx ? GEN_INT (-128) : op3;
+ emit_move_insn (op0, unord_val);
+ }
+ emit_label (l_end);
+}
+
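For reference, the scalar semantics the two expanders above implement can be written as a short sketch (illustrative only, not part of the patch; op3 selects unsigned vs. signed in the integer case):

/* Hedged sketch of what the generated code computes.  */
static inline int
spaceship_int (long a, long b, bool unsigned_p)
{
  if (unsigned_p)
    {
      unsigned long ua = a, ub = b;
      return ua == ub ? 0 : ua < ub ? -1 : 1;
    }
  return a == b ? 0 : a < b ? -1 : 1;
}

static inline int
spaceship_fp (double a, double b, int unordered_val)
{
  if (a == b)
    return 0;
  if (a < b)
    return -1;
  if (a > b)
    return 1;
  return unordered_val;		/* -128 when only the compare is of interest  */
}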
/* This is called from dwarf2out.cc via TARGET_ASM_OUTPUT_DWARF_DTPREL.
We need to emit DTP-relative relocations. */
@@ -9078,15 +9239,12 @@ print_operand (FILE *file, rtx x, int code)
else if (code == 'h')
fprintf (file, HOST_WIDE_INT_PRINT_DEC,
((CONST_WIDE_INT_ELT (x, 0) & 0xffff) ^ 0x8000) - 0x8000);
+ /* Support arbitrary _BitInt constants in asm statements. */
+ else if (code == 0)
+ output_addr_const (file, x);
else
- {
- if (code == 0)
- output_operand_lossage ("invalid constant - try using "
- "an output modifier");
- else
- output_operand_lossage ("invalid constant for output modifier '%c'",
- code);
- }
+ output_operand_lossage ("invalid constant for output modifier '%c'",
+ code);
break;
case CONST_VECTOR:
switch (code)
@@ -18607,6 +18765,27 @@ s390_c_mode_for_floating_type (enum tree_index ti)
return default_mode_for_floating_type (ti);
}
+/* Return true if _BitInt(N) is supported and fill its details into *INFO. */
+
+bool
+s390_bitint_type_info (int n, struct bitint_info *info)
+{
+ if (!TARGET_64BIT)
+ return false;
+ if (n <= 8)
+ info->limb_mode = QImode;
+ else if (n <= 16)
+ info->limb_mode = HImode;
+ else if (n <= 32)
+ info->limb_mode = SImode;
+ else
+ info->limb_mode = DImode;
+ info->abi_limb_mode = info->limb_mode;
+ info->big_endian = true;
+ info->extended = true;
+ return true;
+}
+
/* Initialize GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
@@ -18928,6 +19107,9 @@ s390_c_mode_for_floating_type (enum tree_index ti)
#undef TARGET_DOCUMENTATION_NAME
#define TARGET_DOCUMENTATION_NAME "S/390"
+#undef TARGET_C_BITINT_TYPE_INFO
+#define TARGET_C_BITINT_TYPE_INFO s390_bitint_type_info
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-s390.h"
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 1edbfde..858387c 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -1527,6 +1527,27 @@
operands[0] = SET_DEST (PATTERN (curr_insn));
})
+; Restrict the spaceship optab to z13 or later, where LOAD HALFWORD
+; IMMEDIATE ON CONDITION is available.
+
+(define_mode_iterator SPACESHIP_INT [(TI "TARGET_VX") DI SI HI QI])
+(define_expand "spaceship<mode>4"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SPACESHIP_INT 1 "register_operand")
+ (match_operand:SPACESHIP_INT 2 "register_operand")
+ (match_operand:SI 3 "const_int_operand")]
+ "TARGET_Z13 && TARGET_64BIT"
+ "s390_expand_int_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;")
+
+(define_mode_iterator SPACESHIP_BFP [TF DF SF])
+(define_expand "spaceship<mode>4"
+ [(match_operand:SI 0 "register_operand")
+ (match_operand:SPACESHIP_BFP 1 "register_operand")
+ (match_operand:SPACESHIP_BFP 2 "register_operand")
+ (match_operand:SI 3 "const_int_operand")]
+ "TARGET_Z13 && TARGET_64BIT && TARGET_HARD_FLOAT"
+ "s390_expand_fp_spaceship (operands[0], operands[1], operands[2], operands[3]); DONE;")
+
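At the source level these expanders are exercised by C++20 three-way comparisons; a minimal usage sketch (illustrative only, assuming -std=c++20 on a z13 or newer 64-bit target):

// Hedged usage sketch for the new spaceship<mode>4 expanders.
#include <compare>

int
signum_di (long a, long b)
{
  auto c = a <=> b;			// integer spaceship, DImode operands
  return c < 0 ? -1 : c > 0 ? 1 : 0;
}

int
signum_df (double a, double b)
{
  auto c = a <=> b;			// floating-point spaceship, DFmode operands
  if (c == std::partial_ordering::unordered)
    return -128;			// mirrors the unordered value in the expander comment
  return c < 0 ? -1 : c > 0 ? 1 : 0;
}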
; (TF|DF|SF|TD|DD|SD) instructions
@@ -5227,18 +5248,19 @@
})
(define_insn "*zero_extendsidi2"
- [(set (match_operand:DI 0 "register_operand" "=d,d,d")
- (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "d,T,b")))]
+ [(set (match_operand:DI 0 "register_operand" "=d,d,d,d")
+ (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "d,T,b,v")))]
"TARGET_ZARCH"
"@
llgfr\t%0,%1
llgf\t%0,%1
- llgfrl\t%0,%1"
- [(set_attr "op_type" "RRE,RXY,RIL")
- (set_attr "type" "*,*,larl")
- (set_attr "cpu_facility" "*,*,z10")
- (set_attr "z10prop" "z10_fwd_E1,z10_fwd_A3,z10_fwd_A3")
- (set_attr "relative_long" "*,*,yes")])
+ llgfrl\t%0,%1
+ vlgvf\t%0,%v1,0"
+ [(set_attr "op_type" "RRE,RXY,RIL,VRS")
+ (set_attr "type" "*,*,larl,*")
+ (set_attr "cpu_facility" "*,*,z10,vx")
+ (set_attr "z10prop" "z10_fwd_E1,z10_fwd_A3,z10_fwd_A3,*")
+ (set_attr "relative_long" "*,*,yes,*")])
;
; LLGT-type instructions (zero-extend from 31 bit to 64 bit).
@@ -5341,29 +5363,32 @@
; llhrl, llghrl
(define_insn "*zero_extendhi<mode>2_z10"
- [(set (match_operand:GPR 0 "register_operand" "=d,d,d")
- (zero_extend:GPR (match_operand:HI 1 "nonimmediate_operand" "d,T,b")))]
+ [(set (match_operand:GPR 0 "register_operand" "=d,d,d,d")
+ (zero_extend:GPR (match_operand:HI 1 "nonimmediate_operand" "d,T,b,v")))]
"TARGET_Z10"
"@
ll<g>hr\t%0,%1
ll<g>h\t%0,%1
- ll<g>hrl\t%0,%1"
- [(set_attr "op_type" "RXY,RRE,RIL")
- (set_attr "type" "*,*,larl")
- (set_attr "cpu_facility" "*,*,z10")
- (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,z10_fwd_A3")
- (set_attr "relative_long" "*,*,yes")])
+ ll<g>hrl\t%0,%1
+ vlgvh\t%0,%v1,0"
+ [(set_attr "op_type" "RXY,RRE,RIL,VRS")
+ (set_attr "type" "*,*,larl,*")
+ (set_attr "cpu_facility" "*,*,z10,vx")
+ (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,z10_fwd_A3,*")
+ (set_attr "relative_long" "*,*,yes,*")])
; llhr, llcr, llghr, llgcr, llh, llc, llgh, llgc
(define_insn "*zero_extend<HQI:mode><GPR:mode>2_extimm"
- [(set (match_operand:GPR 0 "register_operand" "=d,d")
- (zero_extend:GPR (match_operand:HQI 1 "nonimmediate_operand" "d,T")))]
+ [(set (match_operand:GPR 0 "register_operand" "=d,d,d")
+ (zero_extend:GPR (match_operand:HQI 1 "nonimmediate_operand" "d,T,v")))]
"TARGET_EXTIMM"
"@
ll<g><hc>r\t%0,%1
- ll<g><hc>\t%0,%1"
- [(set_attr "op_type" "RRE,RXY")
- (set_attr "z10prop" "z10_super_E1,z10_fwd_A3")])
+ ll<g><hc>\t%0,%1
+ vlgv<HQI:bhfgq>\t%0,%v1,0"
+ [(set_attr "op_type" "RRE,RXY,VRS")
+ (set_attr "cpu_facility" "*,*,vx")
+ (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,*")])
; llgh, llgc
(define_insn "*zero_extend<HQI:mode><GPR:mode>2"
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 12bbeb6..745634e 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -501,54 +501,6 @@
SIL,SIL,RI,RI,RRE,RRE,RIL,RR,RXY,RXY,RIL")])
-; Instructions vlgvb, vlgvh, vlgvf zero all remaining bits of a GPR, i.e.,
-; an implicit zero extend is done.
-
-(define_insn "*movdi<mode>_zero_extend_A"
- [(set (match_operand:DI 0 "register_operand" "=d")
- (zero_extend:DI (match_operand:SINT 1 "register_operand" "v")))]
- "TARGET_VX"
- "vlgv<bhfgq>\t%0,%v1,0"
- [(set_attr "op_type" "VRS")])
-
-(define_insn "*movsi<mode>_zero_extend_A"
- [(set (match_operand:SI 0 "register_operand" "=d")
- (zero_extend:SI (match_operand:HQI 1 "register_operand" "v")))]
- "TARGET_VX"
- "vlgv<bhfgq>\t%0,%v1,0"
- [(set_attr "op_type" "VRS")])
-
-(define_mode_iterator VLGV_DI [V1QI V2QI V4QI V8QI V16QI
- V1HI V2HI V4HI V8HI
- V1SI V2SI V4SI])
-(define_insn "*movdi<mode>_zero_extend_B"
- [(set (match_operand:DI 0 "register_operand" "=d")
- (zero_extend:DI (vec_select:<non_vec>
- (match_operand:VLGV_DI 1 "register_operand" "v")
- (parallel [(match_operand:SI 2 "const_int_operand" "n")]))))]
- "TARGET_VX"
-{
- operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1));
- return "vlgv<bhfgq>\t%0,%v1,%Y2";
-}
- [(set_attr "op_type" "VRS")
- (set_attr "mnemonic" "vlgv<bhfgq>")])
-
-(define_mode_iterator VLGV_SI [V1QI V2QI V4QI V8QI V16QI
- V1HI V2HI V4HI V8HI])
-(define_insn "*movsi<mode>_zero_extend_B"
- [(set (match_operand:SI 0 "register_operand" "=d")
- (zero_extend:SI (vec_select:<non_vec>
- (match_operand:VLGV_SI 1 "register_operand" "v")
- (parallel [(match_operand:SI 2 "const_int_operand" "n")]))))]
- "TARGET_VX"
-{
- operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1));
- return "vlgv<bhfgq>\t%0,%v1,%Y2";
-}
- [(set_attr "op_type" "VRS")
- (set_attr "mnemonic" "vlgv<bhfgq>")])
-
; vec_load_lanes?
; vec_store_lanes?
@@ -763,6 +715,42 @@
DONE;
})
+; Instructions vlgvb, vlgvh, vlgvf zero all remaining bits of a GPR, i.e.,
+; an implicit zero extend is done.
+
+(define_mode_iterator VLGV_DI [V1QI V2QI V4QI V8QI V16QI
+ V1HI V2HI V4HI V8HI
+ V1SI V2SI V4SI])
+(define_insn "*vec_extract<mode>_zero_extend"
+ [(set (match_operand:DI 0 "register_operand" "=d")
+ (zero_extend:DI (vec_select:<non_vec>
+ (match_operand:VLGV_DI 1 "register_operand" "v")
+ (parallel [(match_operand:SI 2 "nonmemory_operand" "an")]))))]
+ "TARGET_VX"
+{
+ if (CONST_INT_P (operands[2]))
+ operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1));
+ return "vlgv<bhfgq>\t%0,%v1,%Y2";
+}
+ [(set_attr "op_type" "VRS")
+ (set_attr "mnemonic" "vlgv<bhfgq>")])
+
+(define_mode_iterator VLGV_SI [V1QI V2QI V4QI V8QI V16QI
+ V1HI V2HI V4HI V8HI])
+(define_insn "*vec_extract<mode>_zero_extend"
+ [(set (match_operand:SI 0 "register_operand" "=d")
+ (zero_extend:SI (vec_select:<non_vec>
+ (match_operand:VLGV_SI 1 "register_operand" "v")
+ (parallel [(match_operand:SI 2 "nonmemory_operand" "an")]))))]
+ "TARGET_VX"
+{
+ if (CONST_INT_P (operands[2]))
+ operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1));
+ return "vlgv<bhfgq>\t%0,%v1,%Y2";
+}
+ [(set_attr "op_type" "VRS")
+ (set_attr "mnemonic" "vlgv<bhfgq>")])
+
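Relative to the deleted *mov..._zero_extend_B patterns, the rewritten ones also accept a register element index (the "a" part of the "an" constraint). A hedged sketch of source that could use a single vlgv with the implicit zero extend (GCC vector-extension syntax, illustrative only):

/* Hedged example: variable-index element extraction widening to DImode.  */
typedef unsigned int v4si __attribute__ ((vector_size (16)));

unsigned long
get_elem (v4si v, int i)
{
  return v[i & 3];	/* vlgvf with a register index, zero-extended result  */
}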
(define_insn "*vec_vllezlf<mode>"
[(set (match_operand:V_HW_4 0 "register_operand" "=v")
(vec_concat:V_HW_4
diff --git a/gcc/config/xtensa/constraints.md b/gcc/config/xtensa/constraints.md
index 77c9571..727ec1e 100644
--- a/gcc/config/xtensa/constraints.md
+++ b/gcc/config/xtensa/constraints.md
@@ -130,7 +130,7 @@
(and (match_code "mem")
(match_test "smalloffset_mem_p (op)")))
-(define_memory_constraint "T"
+(define_special_memory_constraint "T"
"Memory in a literal pool (addressable with an L32R instruction)."
(and (match_code "mem")
(match_test "!TARGET_CONST16 && constantpool_mem_p (op)")))
diff --git a/gcc/config/xtensa/predicates.md b/gcc/config/xtensa/predicates.md
index 9aeaba6..20160a4 100644
--- a/gcc/config/xtensa/predicates.md
+++ b/gcc/config/xtensa/predicates.md
@@ -189,6 +189,9 @@
(define_predicate "ubranch_operator"
(match_code "ltu,geu"))
+(define_predicate "alt_ubranch_operator"
+ (match_code "gtu,leu"))
+
(define_predicate "boolean_operator"
(match_code "eq,ne"))
diff --git a/gcc/config/xtensa/xtensa-protos.h b/gcc/config/xtensa/xtensa-protos.h
index 1f5dcf5..98e75c6 100644
--- a/gcc/config/xtensa/xtensa-protos.h
+++ b/gcc/config/xtensa/xtensa-protos.h
@@ -60,6 +60,7 @@ extern bool xtensa_tls_referenced_p (rtx);
extern enum rtx_code xtensa_shlrd_which_direction (rtx, rtx);
extern bool xtensa_split1_finished_p (void);
extern void xtensa_split_DI_reg_imm (rtx *);
+extern char *xtensa_bswapsi2_output (rtx_insn *, const char *);
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, int);
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 02554c5..f3b89de 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -2645,6 +2645,94 @@ xtensa_split_DI_reg_imm (rtx *operands)
}
+/* Return the asm output string for the bswapsi2_internal insn pattern.
+   It does this by scanning backwards within the BB from the specified insn;
+   if another bswapsi2_internal is found, the instruction that sets SAR to 8
+   is omitted.  If none is found, or if a CALL, JUMP, ASM, or other insn
+   that clobbers SAR is found first, an instruction setting SAR to 8 is
+   prepended as usual.  */
+
+static int
+xtensa_bswapsi2_output_1 (rtx_insn *insn)
+{
+ int icode;
+ rtx pat;
+ const char *iname;
+
+  /* CALL insns do not preserve SAR.
+     JUMP insns only appear at the end of a BB, so they do not need to be
+     considered when scanning backwards.  */
+ if (CALL_P (insn))
+ return -1;
+
+ switch (icode = INSN_CODE (insn))
+ {
+    /* Rotate insns clobber SAR.  */
+ case CODE_FOR_rotlsi3:
+ case CODE_FOR_rotrsi3:
+ return -1;
+    /* Simple shift insns clobber SAR if the shift amount is not an immediate.  */
+ case CODE_FOR_ashlsi3_internal:
+ case CODE_FOR_ashrsi3:
+ case CODE_FOR_lshrsi3:
+ if (! CONST_INT_P (XEXP (SET_SRC (PATTERN (insn)), 1)))
+ return -1;
+ break;
+    /* This insn always sets SAR to 8.  */
+ case CODE_FOR_bswapsi2_internal:
+ return 1;
+ default:
+ break;
+ }
+
+ /* "*shift_per_byte" and "*shlrd_*" complex shift insns clobber SAR. */
+ if (icode >= CODE_FOR_nothing
+ && (! strcmp (iname = insn_data[icode].name, "*shift_per_byte")
+ || ! strncmp (iname, "*shlrd_", 7)))
+ return -1;
+
+  /* Asm statements may also clobber SAR, so assume that they do.  */
+ if (NONJUMP_INSN_P (insn))
+ switch (GET_CODE (pat = PATTERN (insn)))
+ {
+ case SET:
+ return GET_CODE (SET_SRC (pat)) == ASM_OPERANDS ? -1 : 0;
+ case PARALLEL:
+ return (GET_CODE (pat = XVECEXP (pat, 0, 0)) == SET
+ && GET_CODE (SET_SRC (pat)) == ASM_OPERANDS)
+ || GET_CODE (pat) == ASM_OPERANDS
+ || GET_CODE (pat) == ASM_INPUT ? -1 : 0;
+ case ASM_OPERANDS:
+ return -1;
+ default:
+ break;
+ }
+
+  /* All other insns do not affect SAR.  */
+ return 0;
+}
+
+char *
+xtensa_bswapsi2_output (rtx_insn *insn, const char *output)
+{
+ static char result[128];
+ int i;
+
+ strcpy (result, "ssai\t8\n\t");
+ while ((insn = prev_nonnote_nondebug_insn_bb (insn)))
+ if ((i = xtensa_bswapsi2_output_1 (insn)) < 0)
+ break;
+ else if (i > 0)
+ {
+ result[0] = '\0';
+ break;
+ }
+ strcat (result, output);
+
+ return result;
+}
+
+
/* Try to split an integer value into what are suitable for two consecutive
immediate addition instructions, ADDI or ADDMI. */
@@ -4702,25 +4790,49 @@ static bool
xtensa_is_insn_L32R_p (const rtx_insn *insn)
{
rtx pat, dest, src;
+ machine_mode mode;
- /* "PATTERN (insn)" can be used without checking, see insn_cost()
- in gcc/rtlanal.cc. */
+  /* RTX insns that are not "(set (reg) ...)" cannot become L32R instructions:
+     - PATTERN() may be applied to the insn without validation.
+       See insn_cost() in gcc/rtlanal.cc.
+     - register_operand() is used instead of REG() so that things which do not
+       look like REGs yet, but will eventually become REGs, are recognized as
+       well.  */
if (GET_CODE (pat = PATTERN (insn)) != SET
|| ! register_operand (dest = SET_DEST (pat), VOIDmode))
return false;
+ /* If the source is a reference to a literal pool entry, then the insn
+ obviously corresponds to an L32R instruction. */
if (constantpool_mem_p (src = SET_SRC (pat)))
return true;
- /* Return true if:
- - CONST16 instruction is not configured, and
- - the source is some constant, and also
- - negation of "the source is integer and fits into the immediate
- field". */
- return (!TARGET_CONST16
- && CONSTANT_P (src)
- && ! ((GET_MODE (dest) == SImode || GET_MODE (dest) == HImode)
- && CONST_INT_P (src) && xtensa_simm12b (INTVAL (src))));
+ /* Similarly, an insn whose source is not a constant obviously does not
+ correspond to L32R. */
+ if (! CONSTANT_P (src))
+ return false;
+
+ /* If the source is a CONST_INT whose value fits into signed 12 bits, then
+ the insn corresponds to a MOVI instruction (rather than an L32R one),
+ regardless of the configuration of TARGET_CONST16 or
+ TARGET_AUTOLITPOOLS. Note that the destination register can be non-
+ SImode. */
+ if (((mode = GET_MODE (dest)) == SImode
+ || mode == HImode || mode == SFmode)
+ && CONST_INT_P (src) && xtensa_simm12b (INTVAL (src)))
+ return false;
+
+ /* If TARGET_CONST16 is configured, constants of the remaining forms
+ correspond to pairs of CONST16 instructions, not L32R. */
+ if (TARGET_CONST16)
+ return false;
+
+ /* The last remaining form of constant is one of the following:
+ - CONST_INTs with large values
+ - floating-point constants
+ - symbolic constants
+     all of which are handled by a relaxed MOVI instruction that is later
+     converted to an L32R instruction by the assembler.  */
+ return true;
}
/* Compute a relative costs of RTL insns. This is necessary in order to
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 629dfdd..52ffb16 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -88,6 +88,7 @@
;; This mode iterator allows the HI and QI patterns to be defined from
;; the same template.
(define_mode_iterator HQI [HI QI])
+(define_mode_attr mode_bits [(HI "16") (QI "8")])
;; This mode iterator allows the SI and HI patterns to be defined from
;; the same template.
@@ -176,19 +177,18 @@
;; Addition.
(define_insn "addsi3"
- [(set (match_operand:SI 0 "register_operand" "=D,D,a,a,a")
- (plus:SI (match_operand:SI 1 "register_operand" "%d,d,r,r,r")
- (match_operand:SI 2 "add_operand" "d,O,r,J,N")))]
- ""
- "@
- add.n\t%0, %1, %2
- addi.n\t%0, %1, %d2
- add\t%0, %1, %2
- addi\t%0, %1, %d2
- addmi\t%0, %1, %x2"
- [(set_attr "type" "arith,arith,arith,arith,arith")
- (set_attr "mode" "SI")
- (set_attr "length" "2,2,3,3,3")])
+ [(set (match_operand:SI 0 "register_operand")
+ (plus:SI (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "add_operand")))]
+ ""
+ {@ [cons: =0, %1, 2; attrs: type, length]
+ [D, d, d; arith, 2] add.n\t%0, %1, %2
+ [D, d, O; arith, 2] addi.n\t%0, %1, %d2
+ [a, r, r; arith, 3] add\t%0, %1, %2
+ [a, r, J; arith, 3] addi\t%0, %1, %d2
+ [a, r, N; arith, 3] addmi\t%0, %1, %x2
+ }
+ [(set_attr "mode" "SI")])
(define_insn "*addsubx"
[(set (match_operand:SI 0 "register_operand" "=a")
@@ -392,18 +392,15 @@
(set_attr "length" "3")])
(define_insn "<u>mulhisi3"
- [(set (match_operand:SI 0 "register_operand" "=C,A")
- (mult:SI (any_extend:SI
- (match_operand:HI 1 "register_operand" "%r,r"))
- (any_extend:SI
- (match_operand:HI 2 "register_operand" "r,r"))))]
+ [(set (match_operand:SI 0 "register_operand")
+ (mult:SI (any_extend:SI (match_operand:HI 1 "register_operand"))
+ (any_extend:SI (match_operand:HI 2 "register_operand"))))]
"TARGET_MUL16 || TARGET_MAC16"
- "@
- mul16<su>\t%0, %1, %2
- <u>mul.aa.ll\t%1, %2"
- [(set_attr "type" "mul16,mac16")
- (set_attr "mode" "SI")
- (set_attr "length" "3,3")])
+ {@ [cons: =0, %1, 2; attrs: type, length]
+ [C, r, r; mul16, 3] mul16<su>\t%0, %1, %2
+ [A, r, r; mac16, 3] <u>mul.aa.ll\t%1, %2
+ }
+ [(set_attr "mode" "SI")])
(define_insn "muladdhisi"
[(set (match_operand:SI 0 "register_operand" "=A")
@@ -652,36 +649,15 @@
})
(define_insn "bswapsi2_internal"
- [(set (match_operand:SI 0 "register_operand" "=a,&a")
- (bswap:SI (match_operand:SI 1 "register_operand" "0,r")))
- (clobber (match_scratch:SI 2 "=&a,X"))]
+ [(set (match_operand:SI 0 "register_operand")
+ (bswap:SI (match_operand:SI 1 "register_operand")))
+ (clobber (match_scratch:SI 2))]
"!optimize_debug && optimize > 1 && !optimize_size"
-{
- rtx_insn *prev_insn = prev_nonnote_nondebug_insn (insn);
- const char *init = "ssai\t8\;";
- static char result[128];
- if (prev_insn && NONJUMP_INSN_P (prev_insn))
- {
- rtx x = PATTERN (prev_insn);
- if (GET_CODE (x) == PARALLEL && XVECLEN (x, 0) == 2
- && GET_CODE (XVECEXP (x, 0, 0)) == SET
- && GET_CODE (XVECEXP (x, 0, 1)) == CLOBBER)
- {
- x = XEXP (XVECEXP (x, 0, 0), 1);
- if (GET_CODE (x) == BSWAP && GET_MODE (x) == SImode)
- init = "";
- }
- }
- sprintf (result,
- (which_alternative == 0)
- ? "%s" "srli\t%%2, %%1, 16\;src\t%%2, %%2, %%1\;src\t%%2, %%2, %%2\;src\t%%0, %%1, %%2"
- : "%s" "srli\t%%0, %%1, 16\;src\t%%0, %%0, %%1\;src\t%%0, %%0, %%0\;src\t%%0, %%1, %%0",
- init);
- return result;
-}
- [(set_attr "type" "arith,arith")
- (set_attr "mode" "SI")
- (set_attr "length" "15,15")])
+ {@ [cons: =0, 1, =2; attrs: type, length]
+ [ a, 0, &a; arith, 15] << xtensa_bswapsi2_output (insn, "srli\t%2, %1, 16\;src\t%2, %2, %1\;src\t%2, %2, %2\;src\t%0, %1, %2");
+ [&a, r, X; arith, 15] << xtensa_bswapsi2_output (insn, "srli\t%0, %1, 16\;src\t%0, %0, %1\;src\t%0, %0, %0\;src\t%0, %1, %0");
+ }
+ [(set_attr "mode" "SI")])
(define_expand "bswapdi2"
[(set (match_operand:DI 0 "register_operand" "")
@@ -742,16 +718,15 @@
;; Logical instructions.
(define_insn "andsi3"
- [(set (match_operand:SI 0 "register_operand" "=a,a")
- (and:SI (match_operand:SI 1 "register_operand" "%r,r")
- (match_operand:SI 2 "mask_operand" "P,r")))]
+ [(set (match_operand:SI 0 "register_operand")
+ (and:SI (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "mask_operand")))]
""
- "@
- extui\t%0, %1, 0, %K2
- and\t%0, %1, %2"
- [(set_attr "type" "arith,arith")
- (set_attr "mode" "SI")
- (set_attr "length" "3,3")])
+ {@ [cons: =0, %1, 2; attrs: type, length]
+ [a, r, P; arith, 3] extui\t%0, %1, 0, %K2
+ [a, r, r; arith, 3] and\t%0, %1, %2
+ }
+ [(set_attr "mode" "SI")])
(define_insn_and_split "*andsi3_bitcmpl"
[(set (match_operand:SI 0 "register_operand" "=a")
@@ -944,27 +919,15 @@
;; Zero-extend instructions.
-(define_insn "zero_extendhisi2"
- [(set (match_operand:SI 0 "register_operand" "=a,a")
- (zero_extend:SI (match_operand:HI 1 "nonimmed_operand" "r,U")))]
- ""
- "@
- extui\t%0, %1, 0, 16
- %v1l16ui\t%0, %1"
- [(set_attr "type" "arith,load")
- (set_attr "mode" "SI")
- (set_attr "length" "3,3")])
-
-(define_insn "zero_extendqisi2"
- [(set (match_operand:SI 0 "register_operand" "=a,a")
- (zero_extend:SI (match_operand:QI 1 "nonimmed_operand" "r,U")))]
+(define_insn "zero_extend<mode>si2"
+ [(set (match_operand:SI 0 "register_operand")
+ (zero_extend:SI (match_operand:HQI 1 "nonimmed_operand")))]
""
- "@
- extui\t%0, %1, 0, 8
- %v1l8ui\t%0, %1"
- [(set_attr "type" "arith,load")
- (set_attr "mode" "SI")
- (set_attr "length" "3,3")])
+ {@ [cons: =0, 1; attrs: type, length]
+ [a, r; arith, 3] extui\t%0, %1, 0, <mode_bits>
+ [a, U; load , 3] %v1l<mode_bits>ui\t%0, %1
+ }
+ [(set_attr "mode" "SI")])
;; Sign-extend instructions.
@@ -982,15 +945,14 @@
})
(define_insn "extendhisi2_internal"
- [(set (match_operand:SI 0 "register_operand" "=B,a")
- (sign_extend:SI (match_operand:HI 1 "sext_operand" "r,U")))]
+ [(set (match_operand:SI 0 "register_operand")
+ (sign_extend:SI (match_operand:HI 1 "sext_operand")))]
""
- "@
- sext\t%0, %1, 15
- %v1l16si\t%0, %1"
- [(set_attr "type" "arith,load")
- (set_attr "mode" "SI")
- (set_attr "length" "3,3")])
+ {@ [cons: =0, 1; attrs: type, length]
+ [B, r; arith, 3] sext\t%0, %1, 15
+ [a, U; load , 3] %v1l16si\t%0, %1
+ }
+ [(set_attr "mode" "SI")])
(define_expand "extendqisi2"
[(set (match_operand:SI 0 "register_operand" "")
@@ -1327,29 +1289,28 @@
})
(define_insn "movsi_internal"
- [(set (match_operand:SI 0 "nonimmed_operand" "=D,D,D,D,R,R,a,q,a,a,W,a,a,U,*a,*A")
- (match_operand:SI 1 "move_operand" "M,D,d,R,D,d,r,r,I,Y,i,T,U,r,*A,*r"))]
+ [(set (match_operand:SI 0 "nonimmed_operand")
+ (match_operand:SI 1 "move_operand"))]
"xtensa_valid_move (SImode, operands)"
- "@
- movi.n\t%0, %x1
- mov.n\t%0, %1
- mov.n\t%0, %1
- %v1l32i.n\t%0, %1
- %v0s32i.n\t%1, %0
- %v0s32i.n\t%1, %0
- mov\t%0, %1
- movsp\t%0, %1
- movi\t%0, %x1
- movi\t%0, %1
- const16\t%0, %t1\;const16\t%0, %b1
- %v1l32r\t%0, %1
- %v1l32i\t%0, %1
- %v0s32i\t%1, %0
- rsr\t%0, ACCLO
- wsr\t%1, ACCLO"
- [(set_attr "type" "move,move,move,load,store,store,move,move,move,load,move,load,load,store,rsr,wsr")
- (set_attr "mode" "SI")
- (set_attr "length" "2,2,2,2,2,2,3,3,3,3,6,3,3,3,3,3")])
+ {@ [cons: =0, 1; attrs: type, length]
+ [ D, M; move , 2] movi.n\t%0, %x1
+ [ D, D; move , 2] mov.n\t%0, %1
+ [ D, d; move , 2] ^
+ [ D, R; load , 2] %v1l32i.n\t%0, %1
+ [ R, D; store, 2] %v0s32i.n\t%1, %0
+ [ R, d; store, 2] ^
+ [ a, r; move , 3] mov\t%0, %1
+ [ q, r; move , 3] movsp\t%0, %1
+ [ a, I; move , 3] movi\t%0, %x1
+ [ a, Y; load , 3] movi\t%0, %1
+ [ W, i; move , 6] const16\t%0, %t1\;const16\t%0, %b1
+ [ a, T; load , 3] %v1l32r\t%0, %1
+ [ a, U; load , 3] %v1l32i\t%0, %1
+ [ U, r; store, 3] %v0s32i\t%1, %0
+ [*a, *A; rsr , 3] rsr\t%0, ACCLO
+ [*A, *r; wsr , 3] wsr\t%1, ACCLO
+ }
+ [(set_attr "mode" "SI")])
(define_split
[(set (match_operand:SHI 0 "register_operand")
@@ -1399,23 +1360,22 @@
})
(define_insn "movhi_internal"
- [(set (match_operand:HI 0 "nonimmed_operand" "=D,D,a,a,a,a,a,U,*a,*A")
- (match_operand:HI 1 "move_operand" "M,d,r,I,Y,T,U,r,*A,*r"))]
+ [(set (match_operand:HI 0 "nonimmed_operand")
+ (match_operand:HI 1 "move_operand"))]
"xtensa_valid_move (HImode, operands)"
- "@
- movi.n\t%0, %x1
- mov.n\t%0, %1
- mov\t%0, %1
- movi\t%0, %x1
- movi\t%0, %1
- %v1l32r\t%0, %1
- %v1l16ui\t%0, %1
- %v0s16i\t%1, %0
- rsr\t%0, ACCLO
- wsr\t%1, ACCLO"
- [(set_attr "type" "move,move,move,move,load,load,load,store,rsr,wsr")
- (set_attr "mode" "HI")
- (set_attr "length" "2,2,3,3,3,3,3,3,3,3")])
+ {@ [cons: =0, 1; attrs: type, length]
+ [ D, M; move , 2] movi.n\t%0, %x1
+ [ D, d; move , 2] mov.n\t%0, %1
+ [ a, r; move , 3] mov\t%0, %1
+ [ a, I; move , 3] movi\t%0, %x1
+ [ a, Y; load , 3] movi\t%0, %1
+ [ a, T; load , 3] %v1l32r\t%0, %1
+ [ a, U; load , 3] %v1l16ui\t%0, %1
+ [ U, r; store, 3] %v0s16i\t%1, %0
+ [*a, *A; rsr , 3] rsr\t%0, ACCLO
+ [*A, *r; wsr , 3] wsr\t%1, ACCLO
+ }
+ [(set_attr "mode" "HI")])
;; 8-bit Integer moves
@@ -1429,21 +1389,20 @@
})
(define_insn "movqi_internal"
- [(set (match_operand:QI 0 "nonimmed_operand" "=D,D,a,a,a,U,*a,*A")
- (match_operand:QI 1 "move_operand" "M,d,r,I,U,r,*A,*r"))]
+ [(set (match_operand:QI 0 "nonimmed_operand")
+ (match_operand:QI 1 "move_operand"))]
"xtensa_valid_move (QImode, operands)"
- "@
- movi.n\t%0, %x1
- mov.n\t%0, %1
- mov\t%0, %1
- movi\t%0, %x1
- %v1l8ui\t%0, %1
- %v0s8i\t%1, %0
- rsr\t%0, ACCLO
- wsr\t%1, ACCLO"
- [(set_attr "type" "move,move,move,move,load,store,rsr,wsr")
- (set_attr "mode" "QI")
- (set_attr "length" "2,2,3,3,3,3,3,3")])
+ {@ [cons: =0, 1; attrs: type, length]
+ [ D, M; move , 2] movi.n\t%0, %x1
+ [ D, d; move , 2] mov.n\t%0, %1
+ [ a, r; move , 3] mov\t%0, %1
+ [ a, I; move , 3] movi\t%0, %x1
+ [ a, U; load , 3] %v1l8ui\t%0, %1
+ [ U, r; store, 3] %v0s8i\t%1, %0
+ [*a, *A; rsr , 3] rsr\t%0, ACCLO
+ [*A, *r; wsr , 3] wsr\t%1, ACCLO
+ }
+ [(set_attr "mode" "QI")])
;; Sub-word reloads from the constant pool.
@@ -1501,30 +1460,29 @@
})
(define_insn "movsf_internal"
- [(set (match_operand:SF 0 "nonimmed_operand" "=f,f,U,D,a,D,R,a,f,a,a,W,a,U")
- (match_operand:SF 1 "move_operand" "f,^U,f,d,T,R,d,r,r,f,Y,iF,U,r"))]
+ [(set (match_operand:SF 0 "nonimmed_operand")
+ (match_operand:SF 1 "move_operand"))]
"((register_operand (operands[0], SFmode)
|| register_operand (operands[1], SFmode))
&& !(FP_REG_P (xt_true_regnum (operands[0]))
&& (constantpool_mem_p (operands[1]) || CONSTANT_P (operands[1]))))"
- "@
- mov.s\t%0, %1
- %v1lsi\t%0, %1
- %v0ssi\t%1, %0
- mov.n\t%0, %1
- %v1l32r\t%0, %1
- %v1l32i.n\t%0, %1
- %v0s32i.n\t%1, %0
- mov\t%0, %1
- wfr\t%0, %1
- rfr\t%0, %1
- movi\t%0, %y1
- const16\t%0, %t1\;const16\t%0, %b1
- %v1l32i\t%0, %1
- %v0s32i\t%1, %0"
- [(set_attr "type" "farith,fload,fstore,move,load,load,store,move,farith,farith,load,move,load,store")
- (set_attr "mode" "SF")
- (set_attr "length" "3,3,3,2,3,2,2,3,3,3,3,6,3,3")])
+ {@ [cons: =0, 1; attrs: type, length]
+ [f, f; farith, 3] mov.s\t%0, %1
+ [f, ^U; fload , 3] %v1lsi\t%0, %1
+ [U, f; fstore, 3] %v0ssi\t%1, %0
+ [D, d; move , 2] mov.n\t%0, %1
+ [a, T; load , 3] %v1l32r\t%0, %1
+ [D, R; load , 2] %v1l32i.n\t%0, %1
+ [R, d; store , 2] %v0s32i.n\t%1, %0
+ [a, r; move , 3] mov\t%0, %1
+ [f, r; farith, 3] wfr\t%0, %1
+ [a, f; farith, 3] rfr\t%0, %1
+ [a, Y; load , 3] movi\t%0, %y1
+ [W, iF; move , 6] const16\t%0, %t1\;const16\t%0, %b1
+ [a, U; load , 3] %v1l32i\t%0, %1
+ [U, r; store , 3] %v0s32i\t%1, %0
+ }
+ [(set_attr "mode" "SF")])
(define_insn "*lsiu"
[(set (match_operand:SF 0 "register_operand" "=f")
@@ -1692,16 +1650,15 @@
})
(define_insn "ashlsi3_internal"
- [(set (match_operand:SI 0 "register_operand" "=a,a")
- (ashift:SI (match_operand:SI 1 "register_operand" "r,r")
- (match_operand:SI 2 "arith_operand" "J,r")))]
+ [(set (match_operand:SI 0 "register_operand")
+ (ashift:SI (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "arith_operand")))]
""
- "@
- slli\t%0, %1, %R2
- ssl\t%2\;sll\t%0, %1"
- [(set_attr "type" "arith,arith")
- (set_attr "mode" "SI")
- (set_attr "length" "3,6")])
+ {@ [cons: =0, 1, 2; attrs: type, length]
+ [a, r, J; arith, 3] slli\t%0, %1, %R2
+ [a, r, r; arith, 6] ssl\t%2\;sll\t%0, %1
+ }
+ [(set_attr "mode" "SI")])
(define_split
[(set (match_operand:SI 0 "register_operand")
@@ -1713,35 +1670,26 @@
(match_dup 1)))])
(define_insn "ashrsi3"
- [(set (match_operand:SI 0 "register_operand" "=a,a")
- (ashiftrt:SI (match_operand:SI 1 "register_operand" "r,r")
- (match_operand:SI 2 "arith_operand" "J,r")))]
+ [(set (match_operand:SI 0 "register_operand")
+ (ashiftrt:SI (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "arith_operand")))]
""
- "@
- srai\t%0, %1, %R2
- ssr\t%2\;sra\t%0, %1"
- [(set_attr "type" "arith,arith")
- (set_attr "mode" "SI")
- (set_attr "length" "3,6")])
+ {@ [cons: =0, 1, 2; attrs: type, length]
+ [a, r, J; arith, 3] srai\t%0, %1, %R2
+ [a, r, r; arith, 6] ssr\t%2\;sra\t%0, %1
+ }
+ [(set_attr "mode" "SI")])
(define_insn "lshrsi3"
- [(set (match_operand:SI 0 "register_operand" "=a,a")
- (lshiftrt:SI (match_operand:SI 1 "register_operand" "r,r")
- (match_operand:SI 2 "arith_operand" "J,r")))]
+ [(set (match_operand:SI 0 "register_operand")
+ (lshiftrt:SI (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "arith_operand")))]
""
-{
- if (which_alternative == 0)
- {
- if ((INTVAL (operands[2]) & 0x1f) < 16)
- return "srli\t%0, %1, %R2";
- else
- return "extui\t%0, %1, %R2, %L2";
- }
- return "ssr\t%2\;srl\t%0, %1";
-}
- [(set_attr "type" "arith,arith")
- (set_attr "mode" "SI")
- (set_attr "length" "3,6")])
+ {@ [cons: =0, 1, 2; attrs: type, length]
+ [a, r, J; arith, 3] << (INTVAL (operands[2]) & 0x1f) < 16 ? \"srli\t%0, %1, %R2\" : \"extui\t%0, %1, %R2, %L2\";
+ [a, r, r; arith, 6] ssr\t%2\;srl\t%0, %1
+ }
+ [(set_attr "mode" "SI")])
(define_insn "*shift_per_byte"
[(set (match_operand:SI 0 "register_operand" "=a")
@@ -1944,28 +1892,26 @@
(set_attr "length" "6")])
(define_insn "rotlsi3"
- [(set (match_operand:SI 0 "register_operand" "=a,a")
- (rotate:SI (match_operand:SI 1 "register_operand" "r,r")
- (match_operand:SI 2 "arith_operand" "J,r")))]
+ [(set (match_operand:SI 0 "register_operand")
+ (rotate:SI (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "arith_operand")))]
""
- "@
- ssai\t%L2\;src\t%0, %1, %1
- ssl\t%2\;src\t%0, %1, %1"
- [(set_attr "type" "multi,multi")
- (set_attr "mode" "SI")
- (set_attr "length" "6,6")])
+ {@ [cons: =0, 1, 2; attrs: type, length]
+ [a, r, J; multi, 6] ssai\t%L2\;src\t%0, %1, %1
+ [a, r, r; multi, 6] ssl\t%2\;src\t%0, %1, %1
+ }
+ [(set_attr "mode" "SI")])
(define_insn "rotrsi3"
- [(set (match_operand:SI 0 "register_operand" "=a,a")
- (rotatert:SI (match_operand:SI 1 "register_operand" "r,r")
- (match_operand:SI 2 "arith_operand" "J,r")))]
+ [(set (match_operand:SI 0 "register_operand")
+ (rotatert:SI (match_operand:SI 1 "register_operand")
+ (match_operand:SI 2 "arith_operand")))]
""
- "@
- ssai\t%R2\;src\t%0, %1, %1
- ssr\t%2\;src\t%0, %1, %1"
- [(set_attr "type" "multi,multi")
- (set_attr "mode" "SI")
- (set_attr "length" "6,6")])
+ {@ [cons: =0, 1, 2; attrs: type, length]
+ [a, r, J; multi, 6] ssai\t%R2\;src\t%0, %1, %1
+ [a, r, r; multi, 6] ssr\t%2\;src\t%0, %1, %1
+ }
+ [(set_attr "mode" "SI")])
;; Comparisons.
@@ -2024,26 +1970,23 @@
[(match_operand:SI 0 "register_operand" "r")
(const_int -2147483648)])
(label_ref (match_operand 1 ""))
- (pc)))]
+ (pc)))
+ (clobber (match_scratch:SI 3 "=a"))]
"TARGET_ABS"
"#"
- "&& can_create_pseudo_p ()"
+ "&& 1"
[(set (match_dup 3)
(abs:SI (match_dup 0)))
(set (pc)
(if_then_else (match_op_dup 2
- [(zero_extract:SI (match_dup 3)
- (const_int 1)
- (match_dup 4))
+ [(match_dup 3)
(const_int 0)])
(label_ref (match_dup 1))
(pc)))]
{
- operands[3] = gen_reg_rtx (SImode);
- operands[4] = GEN_INT (BITS_BIG_ENDIAN ? 0 : 31);
- operands[2] = gen_rtx_fmt_ee (reverse_condition (GET_CODE (operands[2])),
- VOIDmode, XEXP (operands[2], 0),
- const0_rtx);
+ if (GET_CODE (operands[3]) == SCRATCH)
+ operands[3] = gen_reg_rtx (SImode);
+ PUT_CODE (operands[2], GET_CODE (operands[2]) == EQ ? LT : GE);
}
[(set_attr "type" "jump")
(set_attr "mode" "none")
@@ -2190,7 +2133,7 @@
(label_ref (match_dup 1))
(pc)))]
{
- operands[3] = GEN_INT ((1 << GET_MODE_BITSIZE (GET_MODE (operands[3]))) - 1);
+ operands[3] = GEN_INT (GET_MODE_MASK (GET_MODE (operands[3])));
})
(define_insn_and_split "*masktrue_const_pow2_minus_one"
@@ -3370,6 +3313,42 @@
(const_int 8)
(const_int 9))))])
+(define_insn_and_split "*eqne_in_range"
+ [(set (pc)
+ (if_then_else (match_operator 4 "alt_ubranch_operator"
+ [(plus:SI (match_operand:SI 0 "register_operand" "r")
+ (match_operand:SI 1 "const_int_operand" "i"))
+ (match_operand:SI 2 "const_int_operand" "i")])
+ (label_ref (match_operand 3 ""))
+ (pc)))
+ (clobber (match_scratch:SI 5 "=&a"))]
+ "TARGET_MINMAX && TARGET_CLAMPS
+ && INTVAL (operands[1]) * 2 - INTVAL (operands[2]) == 1
+ && IN_RANGE (exact_log2 (INTVAL (operands[1])), 7, 22)"
+ "#"
+ "&& 1"
+ [(set (match_dup 5)
+ (smin:SI (smax:SI (match_dup 0)
+ (match_dup 1))
+ (match_dup 2)))
+ (set (pc)
+ (if_then_else (match_op_dup 4
+ [(match_dup 0)
+ (match_dup 5)])
+ (label_ref (match_dup 3))
+ (pc)))]
+{
+ HOST_WIDE_INT v = INTVAL (operands[1]);
+ operands[1] = GEN_INT (-v);
+ operands[2] = GEN_INT (v - 1);
+ PUT_CODE (operands[4], GET_CODE (operands[4]) == GTU ? NE : EQ);
+ if (GET_CODE (operands[5]) == SCRATCH)
+ operands[5] = gen_reg_rtx (SImode);
+}
+ [(set_attr "type" "jump")
+ (set_attr "mode" "none")
+ (set_attr "length" "6")])
+
(define_split
[(clobber (match_operand 0 "register_operand"))]
"HARD_REGISTER_P (operands[0])