105 files changed, 3758 insertions, 176 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7160c96..1181667 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,56 @@
+2025-07-06  Georg-Johann Lay  <avr@gjlay.de>
+
+	* config/avr/avr-mcus.def: -mmcu= takes lower case MCU names.
+	* doc/avr-mmcu.texi: Rebuild.
+
+2025-07-06  Georg-Johann Lay  <avr@gjlay.de>
+
+	* config/avr/avr-mcus.def (avr32da28S, avr32da32S, avr32da48S)
+	(avr64da28S, avr64da32S, avr64da48S avr64da64S)
+	(avr128da28S, avr128da32S, avr128da48S, avr128da64S): Add devices.
+	* doc/avr-mmcu.texi: Rebuild.
+
+2025-07-06  Andrew Pinski  <quic_apinski@quicinc.com>
+
+	PR tree-optimization/120951
+	* tree-call-cdce.cc (use_internal_fn): For non-call exceptions
+	with EQ_EXPR can throw for floating point types, then create
+	the EQ_EXPR seperately.
+
+2025-07-06  Andrew Pinski  <quic_apinski@quicinc.com>
+
+	PR middle-end/120921
+	* tree-cfg.cc (verify_gimple_assign_single): Reject constant and address expression LHS.
+	For non-empty vector constructors, make sure the LHS is an is_gimple_reg.
+
+2025-07-06  Jan Hubicka  <hubicka@ucw.cz>
+
+	* auto-profile.cc
+	(autofdo_source_profile::read): Scale cutoff.
+	(read_autofdo_file): Initialize cutoff
+	* coverage.cc (read_counts_file): Initialize cutoff to 1.
+	* gcov-io.h (struct gcov_summary): Add cutoff field.
+	* ipa-inline.cc (inline_small_functions): mac_count can be non-zero
+	also with auto_profile.
+	* lto-cgraph.cc (output_profile_summary): Write cutoff
+	and sum_max.
+	(input_profile_summary): Read cutoff and sum max.
+	(merge_profile_summaries): Initialize and scale global cutoffs
+	and sum max.
+	* profile-count.cc: Include profile.h
+	(profile_count::force_nonzero): move here from ...; use cutoff.
+	* profile-count.h: (profile_count::force_nonzero): ... here.
+
+2025-07-06  Jan Hubicka  <hubicka@ucw.cz>
+
+	* profile-count.cc (profile_count::operator*): fix overflow check.
+
+2025-07-05  Alexandre Oliva  <oliva@adacore.com>
+
+	* config/rs6000/vxworks.h (SUBTARGET_DRIVER_SELF_SPECS):
+	Redefine to select word size matching TARGET_VXWORKS64.
+	(TARGET_VXWORKS64): Redefine in terms of TARGET_64BIT.
+
 2025-07-04  Vineet Gupta  <vineetg@rivosinc.com>
 
 	PR target/118241
diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP
index 93909f8..d4353d1 100644
--- a/gcc/DATESTAMP
+++ b/gcc/DATESTAMP
@@ -1 +1 @@
-20250705
+20250707
diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 64f4cda..a970eb8 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -2522,6 +2522,7 @@ autofdo_source_profile::read ()
     afdo_count_scale
       = MAX (((gcov_type)1 << (profile_count::n_bits / 2))
 	     / afdo_profile_info->sum_max, 1);
+  afdo_profile_info->cutoff *= afdo_count_scale;
   afdo_hot_bb_threshod
     = hot_frac
       ? afdo_profile_info->sum_max * afdo_count_scale / hot_frac
@@ -2531,10 +2532,12 @@ autofdo_source_profile::read ()
     fprintf (dump_file, "Max count in profile %" PRIu64 "\n"
 			"Setting scale %" PRIu64 "\n"
 			"Scaled max count %" PRIu64 "\n"
+			"Cutoff %" PRIu64 "\n"
 			"Hot count threshold %" PRIu64 "\n\n",
 	     (int64_t)afdo_profile_info->sum_max,
 	     (int64_t)afdo_count_scale,
 	     (int64_t)(afdo_profile_info->sum_max * afdo_count_scale),
+	     (int64_t)afdo_profile_info->cutoff,
 	     (int64_t)afdo_hot_bb_threshod);
   afdo_profile_info->sum_max *= afdo_count_scale;
   return true;
@@ -3865,6 +3868,7 @@ read_autofdo_file (void)
   autofdo::afdo_profile_info = XNEW (gcov_summary);
   autofdo::afdo_profile_info->runs = 1;
   autofdo::afdo_profile_info->sum_max = 0;
+  autofdo::afdo_profile_info->cutoff = 1;
 
   /* Read the profile from the profile file.  */
   autofdo::read_profile ();
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index a2ce372..7f580a3 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -7799,11 +7799,17 @@ expand_builtin_crc_table_based (internal_fn fn, scalar_mode crc_mode,
 
   rtx op1 = expand_normal (rhs1);
   rtx op2 = expand_normal (rhs2);
-  gcc_assert (TREE_CODE (rhs3) == INTEGER_CST);
-  rtx op3 = gen_int_mode (TREE_INT_CST_LOW (rhs3), crc_mode);
+  rtx op3;
+  if (TREE_CODE (rhs3) != INTEGER_CST)
+    {
+      error ("third argument to %<crc%> builtins must be a constant");
+      op3 = const0_rtx;
+    }
+  else
+    op3 = convert_to_mode (crc_mode, expand_normal (rhs3), 0);
 
   if (CONST_INT_P (op2))
-    op2 = gen_int_mode (INTVAL (op2), crc_mode);
+    op2 = convert_to_mode (crc_mode, op2, 0);
 
   if (fn == IFN_CRC)
     expand_crc_table_based (target, op1, op2, op3, data_mode);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 4aecb3a..6b5113e 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3966,7 +3966,7 @@
 )
 
 ;; Predicated predicate inverse.
-(define_insn "*one_cmpl<mode>3"
+(define_insn "@aarch64_pred_one_cmpl<mode>_z"
   [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
 	(and:PRED_ALL
 	  (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa"))
@@ -8637,8 +8637,8 @@
 (define_expand "vec_cmp<mode><vpred>"
   [(set (match_operand:<VPRED> 0 "register_operand")
 	(match_operator:<VPRED> 1 "comparison_operator"
-	  [(match_operand:SVE_FULL_F 2 "register_operand")
-	   (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]))]
+	  [(match_operand:SVE_F 2 "register_operand")
+	   (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]))]
   "TARGET_SVE"
   {
     aarch64_expand_sve_vec_cmp_float (operands[0], GET_CODE (operands[1]),
@@ -8651,10 +8651,10 @@
 (define_insn "@aarch64_pred_fcm<cmp_op><mode>"
   [(set (match_operand:<VPRED> 0 "register_operand")
 	(unspec:<VPRED>
-	  [(match_operand:<VPRED> 1 "register_operand")
+	  [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
 	   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
-	   (match_operand:SVE_FULL_F 3 "register_operand")
-	   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+	   (match_operand:SVE_F 3 "register_operand")
+	   (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
 	  SVE_COND_FP_CMP_I0))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 3 , 4   ]
@@ -8667,10 +8667,10 @@
 (define_insn "@aarch64_pred_fcmuo<mode>"
   [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
 	(unspec:<VPRED>
-	  [(match_operand:<VPRED> 1 "register_operand" "Upl")
+	  [(match_operand:<VPRED> 1 "aarch64_predicate_operand" "Upl")
 	   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
-	   (match_operand:SVE_FULL_F 3 "register_operand" "w")
-	   (match_operand:SVE_FULL_F 4 "register_operand" "w")]
+	   (match_operand:SVE_F 3 "register_operand" "w")
+	   (match_operand:SVE_F 4 "register_operand" "w")]
 	  UNSPEC_COND_FCMUO))]
   "TARGET_SVE"
   "fcmuo\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>"
@@ -11437,16 +11437,12 @@
 
 (define_insn "@aarch64_sve_set_neonq_<mode>"
   [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-      (unspec:SVE_FULL
-	[(match_operand:SVE_FULL 1 "register_operand" "w")
-	(match_operand:<V128> 2 "register_operand" "w")
-	(match_operand:<VPRED> 3 "register_operand" "Upl")]
-	UNSPEC_SET_NEONQ))]
+	(unspec:SVE_FULL
+	  [(match_operand:SVE_FULL 1 "register_operand" "w")
+	   (match_operand:<V128> 2 "register_operand" "w")
+	   (match_operand:<VPRED> 3 "register_operand" "Upl")]
+	  UNSPEC_SET_NEONQ))]
   "TARGET_SVE
    && BYTES_BIG_ENDIAN"
-  {
-    operands[2] = lowpart_subreg (<MODE>mode, operands[2],
-                                  GET_MODE (operands[2]));
-    return "sel\t%0.<Vetype>, %3, %2.<Vetype>, %1.<Vetype>";
-  }
+  "sel\t%0.<Vetype>, %3, %Z2.<Vetype>, %1.<Vetype>"
 )
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f3ce3a1..7960b63 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24830,6 +24830,13 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       emit_insn (rec_seq);
     }
 
+  /* The two halves should (by induction) be individually endian-correct.
+     However, in the memory layout provided by VALS, the nth element of
+     HALVES[0] comes immediately before the nth element HALVES[1].
+     This means that, on big-endian targets, the nth element of HALVES[0]
+     is more significant than the nth element HALVES[1].  */
+  if (BYTES_BIG_ENDIAN)
+    std::swap (halves[0], halves[1]);
   rtvec v = gen_rtvec (2, halves[0], halves[1]);
   rtx_insn *zip1_insn
     = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
@@ -27300,7 +27307,7 @@ aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
 			  bool known_ptrue_p, rtx op0, rtx op1)
 {
   rtx flag = gen_int_mode (known_ptrue_p, SImode);
-  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
+  rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
 			       gen_rtvec (4, pred, flag, op0, op1),
 			       aarch64_unspec_cond_code (code));
   emit_set_insn (target, unspec);
@@ -27319,10 +27326,10 @@ static void
 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
 			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
 {
-  machine_mode pred_mode = GET_MODE (pred);
-  rtx tmp1 = gen_reg_rtx (pred_mode);
+  machine_mode target_mode = GET_MODE (target);
+  rtx tmp1 = gen_reg_rtx (target_mode);
   aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
-  rtx tmp2 = gen_reg_rtx (pred_mode);
+  rtx tmp2 = gen_reg_rtx (target_mode);
   aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
 }
@@ -27339,8 +27346,7 @@ static void
 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
 				 bool known_ptrue_p, rtx op0, rtx op1)
 {
-  machine_mode pred_mode = GET_MODE (pred);
-  rtx tmp = gen_reg_rtx (pred_mode);
+  rtx tmp = gen_reg_rtx (GET_MODE (target));
   aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
   aarch64_emit_unop (target, one_cmpl_optab, tmp);
 }
@@ -27352,10 +27358,25 @@ aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
 void
 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
 {
-  machine_mode pred_mode = GET_MODE (target);
   machine_mode data_mode = GET_MODE (op0);
+  rtx pred = aarch64_sve_fp_pred (data_mode, nullptr);
 
-  rtx ptrue = aarch64_ptrue_reg (pred_mode);
+  /* The governing and destination modes.  */
+  machine_mode pred_mode = GET_MODE (pred);
+  machine_mode target_mode = GET_MODE (target);
+
+  /* For partial vector modes, the choice of predicate mode depends
+     on whether we need to suppress exceptions for inactive elements.
+     If we do need to suppress exceptions, the predicate mode matches
+     the element size rather than the container size and the predicate
+     marks the upper bits in each container as inactive.  The predicate
+     is then a ptrue wrt TARGET_MODE but not wrt PRED_MODE.  It is the
+     latter which matters here.
+
+     If we don't need to suppress exceptions, the predicate mode matches
+     the container size, PRED_MODE == TARGET_MODE, and the predicate is
+     thus a ptrue wrt both TARGET_MODE and PRED_MODE.  */
+  bool known_ptrue_p = pred_mode == target_mode;
   switch (code)
     {
     case UNORDERED:
@@ -27369,12 +27390,13 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
     case EQ:
     case NE:
       /* There is native support for the comparison.  */
-      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
+      aarch64_emit_sve_fp_cond (target, code, pred, known_ptrue_p, op0, op1);
       return;
 
     case LTGT:
       /* This is a trapping operation (LT or GT).  */
-      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
+      aarch64_emit_sve_or_fp_conds (target, LT, GT,
+				    pred, known_ptrue_p, op0, op1);
       return;
 
     case UNEQ:
@@ -27383,7 +27405,7 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
 	  /* This would trap for signaling NaNs.  */
 	  op1 = force_reg (data_mode, op1);
 	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
-					ptrue, true, op0, op1);
+					pred, known_ptrue_p, op0, op1);
 	  return;
 	}
       /* fall through */
@@ -27393,11 +27415,19 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
     case UNGE:
       if (flag_trapping_math)
 	{
-	  /* Work out which elements are ordered.  */
-	  rtx ordered = gen_reg_rtx (pred_mode);
 	  op1 = force_reg (data_mode, op1);
-	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
-					   ptrue, true, op0, op1);
+
+	  /* Work out which elements are unordered.  */
+	  rtx uo_tmp = gen_reg_rtx (target_mode);
+	  aarch64_emit_sve_fp_cond (uo_tmp, UNORDERED,
+				    pred, known_ptrue_p, op0, op1);
+
+	  /* Invert the result.  Governered by PRED so that we only
+	     flip the active bits.  */
+	  rtx ordered = gen_reg_rtx (pred_mode);
+	  uo_tmp = gen_lowpart (pred_mode, uo_tmp);
+	  emit_insn (gen_aarch64_pred_one_cmpl_z (pred_mode, ordered,
+						  pred, uo_tmp));
 
 	  /* Test the opposite condition for the ordered elements,
 	     then invert the result.  */
@@ -27422,7 +27452,8 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1)
 
   /* There is native support for the inverse comparison.  */
   code = reverse_condition_maybe_unordered (code);
-  aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
+  aarch64_emit_sve_invert_fp_cond (target, code,
+				   pred, known_ptrue_p, op0, op1);
 }
 
 /* Return true if:
diff --git a/gcc/config/avr/avr-mcus.def b/gcc/config/avr/avr-mcus.def
index ad64050..2e7c8ac 100644
--- a/gcc/config/avr/avr-mcus.def
+++ b/gcc/config/avr/avr-mcus.def
@@ -313,6 +313,10 @@ AVR_MCU ("avr64da28",        ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR
 AVR_MCU ("avr64da32",        ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DA32__",   0x6000, 0x0, 0x10000, 0)
 AVR_MCU ("avr64da48",        ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DA48__",   0x6000, 0x0, 0x10000, 0)
 AVR_MCU ("avr64da64",        ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DA64__",   0x6000, 0x0, 0x10000, 0)
+AVR_MCU ("avr64da28s",       ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DA28S__",  0x6000, 0x0, 0x10000, 0)
+AVR_MCU ("avr64da32s",       ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DA32S__",  0x6000, 0x0, 0x10000, 0)
+AVR_MCU ("avr64da48s",       ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DA48S__",  0x6000, 0x0, 0x10000, 0)
+AVR_MCU ("avr64da64s",       ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DA64S__",  0x6000, 0x0, 0x10000, 0)
 AVR_MCU ("avr64db28",        ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DB28__",   0x6000, 0x0, 0x10000, 0)
 AVR_MCU ("avr64db32",        ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DB32__",   0x6000, 0x0, 0x10000, 0)
 AVR_MCU ("avr64db48",        ARCH_AVRXMEGA2, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR64DB48__",   0x6000, 0x0, 0x10000, 0)
@@ -389,6 +393,9 @@ AVR_MCU ("avr16du32",        ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR
 AVR_MCU ("avr32da28",        ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DA28__",   0x7000, 0x0, 0x8000, 0x8000)
 AVR_MCU ("avr32da32",        ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DA32__",   0x7000, 0x0, 0x8000, 0x8000)
 AVR_MCU ("avr32da48",        ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DA48__",   0x7000, 0x0, 0x8000, 0x8000)
+AVR_MCU ("avr32da28s",       ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DA28S__",  0x7000, 0x0, 0x8000, 0x8000)
+AVR_MCU ("avr32da32s",       ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DA32S__",  0x7000, 0x0, 0x8000, 0x8000)
+AVR_MCU ("avr32da48s",       ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DA48S__",  0x7000, 0x0, 0x8000, 0x8000)
 AVR_MCU ("avr32db28",        ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DB28__",   0x7000, 0x0, 0x8000, 0x8000)
 AVR_MCU ("avr32db32",        ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DB32__",   0x7000, 0x0, 0x8000, 0x8000)
 AVR_MCU ("avr32db48",        ARCH_AVRXMEGA3, AVR_CVT,                 "__AVR_AVR32DB48__",   0x7000, 0x0, 0x8000, 0x8000)
@@ -427,6 +434,10 @@ AVR_MCU ("avr128da28",       ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR
 AVR_MCU ("avr128da32",       ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DA32__",  0x4000, 0x0, 0x20000, 0)
 AVR_MCU ("avr128da48",       ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DA48__",  0x4000, 0x0, 0x20000, 0)
 AVR_MCU ("avr128da64",       ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DA64__",  0x4000, 0x0, 0x20000, 0)
+AVR_MCU ("avr128da28s",      ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DA28S__", 0x4000, 0x0, 0x20000, 0)
+AVR_MCU ("avr128da32s",      ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DA32S__", 0x4000, 0x0, 0x20000, 0)
+AVR_MCU ("avr128da48s",      ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DA48S__", 0x4000, 0x0, 0x20000, 0)
+AVR_MCU ("avr128da64s",      ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DA64S__", 0x4000, 0x0, 0x20000, 0)
 AVR_MCU ("avr128db28",       ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DB28__",  0x4000, 0x0, 0x20000, 0)
 AVR_MCU ("avr128db32",       ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DB32__",  0x4000, 0x0, 0x20000, 0)
 AVR_MCU ("avr128db48",       ARCH_AVRXMEGA4, AVR_CVT | AVR_ISA_FLMAP, "__AVR_AVR128DB48__",  0x4000, 0x0, 0x20000, 0)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 83076ad..8f15c1c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -7899,7 +7899,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
 			       rtx count, machine_mode mode, int unroll,
 			       int expected_size, bool issetmem)
 {
-  rtx_code_label *out_label, *top_label;
+  rtx_code_label *out_label = nullptr;
+  rtx_code_label *top_label = nullptr;
   rtx iter, tmp;
   machine_mode iter_mode = counter_mode (count);
   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
@@ -7907,9 +7908,19 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
   rtx size;
   int i;
+  int loop_count;
 
-  top_label = gen_label_rtx ();
-  out_label = gen_label_rtx ();
+  if (expected_size != -1 && CONST_INT_P (count))
+    loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
+  else
+    loop_count = -1;
+
+  /* Don't generate the loop if the loop count is 1.  */
+  if (loop_count != 1)
+    {
+      top_label = gen_label_rtx ();
+      out_label = gen_label_rtx ();
+    }
   iter = gen_reg_rtx (iter_mode);
 
   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
@@ -7923,7 +7934,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
     }
   emit_move_insn (iter, const0_rtx);
 
-  emit_label (top_label);
+  if (loop_count != 1)
+    emit_label (top_label);
 
   tmp = convert_modes (Pmode, iter_mode, iter, true);
 
@@ -7991,21 +8003,25 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
-			   true, top_label);
-  if (expected_size != -1)
+  if (loop_count != 1)
     {
-      expected_size /= GET_MODE_SIZE (mode) * unroll;
-      if (expected_size == 0)
-	predict_jump (0);
-      else if (expected_size > REG_BR_PROB_BASE)
-	predict_jump (REG_BR_PROB_BASE - 1);
+      emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+			       true, top_label);
+      if (expected_size != -1)
+	{
+	  expected_size /= GET_MODE_SIZE (mode) * unroll;
+	  if (expected_size == 0)
+	    predict_jump (0);
+	  else if (expected_size > REG_BR_PROB_BASE)
+	    predict_jump (REG_BR_PROB_BASE - 1);
+	  else
+	    predict_jump (REG_BR_PROB_BASE
+			  - (REG_BR_PROB_BASE + expected_size / 2)
+			    / expected_size);
+	}
       else
-        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
-		      / expected_size);
+	predict_jump (REG_BR_PROB_BASE * 80 / 100);
     }
-  else
-    predict_jump (REG_BR_PROB_BASE * 80 / 100);
   iter = ix86_zero_extend_to_Pmode (iter);
   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
 			     true, OPTAB_LIB_WIDEN);
@@ -8018,7 +8034,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
       if (tmp != srcptr)
 	emit_move_insn (srcptr, tmp);
     }
-  emit_label (out_label);
+  if (loop_count != 1)
+    emit_label (out_label);
 }
 
 /* Divide COUNTREG by SCALE.  */
@@ -8221,19 +8238,11 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem,
   rtx src, dest;
   if (CONST_INT_P (count))
     {
-      HOST_WIDE_INT countval = INTVAL (count);
-      HOST_WIDE_INT epilogue_size = countval % max_size;
-      int i;
-
-      /* For now MAX_SIZE should be a power of 2.  This assert could be
-	 relaxed, but it'll require a bit more complicated epilogue
-	 expanding.  */
-      gcc_assert ((max_size & (max_size - 1)) == 0);
-      for (i = max_size; i >= 1; i >>= 1)
-	{
-	  if (epilogue_size & i)
-	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
-	}
+      unsigned HOST_WIDE_INT countval = UINTVAL (count);
+      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
+      unsigned int destalign = MEM_ALIGN (destmem);
+      move_by_pieces (destmem, srcmem, epilogue_size, destalign,
+		      RETURN_BEGIN);
       return;
     }
   if (max_size > 8)
@@ -8394,6 +8403,81 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
 				 1, max_size / 2, true);
 }
 
+/* Callback routine for store_by_pieces.  Return the RTL of a register
+   containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
+   is a word or a word vector register.  If PREV_P isn't nullptr, it
+   has the RTL info from the previous iteration.  */
+
+static rtx
+setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
+			 fixed_size_mode mode)
+{
+  rtx target;
+  by_pieces_prev *prev = (by_pieces_prev *) prev_p;
+  if (prev)
+    {
+      rtx prev_op = prev->data;
+      if (prev_op)
+	{
+	  machine_mode prev_mode = GET_MODE (prev_op);
+	  if (prev_mode == mode)
+	    return prev_op;
+	  if (VECTOR_MODE_P (prev_mode)
+	      && VECTOR_MODE_P (mode)
+	      && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
+	    {
+	      target = gen_rtx_SUBREG (mode, prev_op, 0);
+	      return target;
+	    }
+	}
+    }
+
+  rtx op = (rtx) op_p;
+  machine_mode op_mode = GET_MODE (op);
+
+  gcc_assert (op_mode == word_mode
+	      || (VECTOR_MODE_P (op_mode)
+		  && GET_MODE_INNER (op_mode) == word_mode));
+
+  if (VECTOR_MODE_P (mode))
+    {
+      gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+      unsigned int op_size = GET_MODE_SIZE (op_mode);
+      unsigned int size = GET_MODE_SIZE (mode);
+      unsigned int nunits = op_size / GET_MODE_SIZE (QImode);
+      machine_mode vec_mode
+	= mode_for_vector (QImode, nunits).require ();
+      target = gen_reg_rtx (vec_mode);
+      op = gen_rtx_SUBREG (vec_mode, op, 0);
+      emit_move_insn (target, op);
+      if (op_size == size)
+	return target;
+
+      rtx tmp = gen_reg_rtx (mode);
+      target = gen_rtx_SUBREG (mode, target, 0);
+      emit_move_insn (tmp, target);
+      return tmp;
+    }
+
+  target = gen_reg_rtx (word_mode);
+  if (VECTOR_MODE_P (op_mode))
+    {
+      op = gen_rtx_SUBREG (word_mode, op, 0);
+      emit_move_insn (target, op);
+    }
+  else
+    target = op;
+
+  if (mode == word_mode)
+    return target;
+
+  rtx tmp = gen_reg_rtx (mode);
+  target = gen_rtx_SUBREG (mode, target, 0);
+  emit_move_insn (tmp, target);
+  return tmp;
+}
+
 /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
 static void
 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
@@ -8403,24 +8487,12 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
 
   if (CONST_INT_P (count))
     {
-      HOST_WIDE_INT countval = INTVAL (count);
-      HOST_WIDE_INT epilogue_size = countval % max_size;
-      int i;
-
-      /* For now MAX_SIZE should be a power of 2.  This assert could be
-	 relaxed, but it'll require a bit more complicated epilogue
-	 expanding.  */
-      gcc_assert ((max_size & (max_size - 1)) == 0);
-      for (i = max_size; i >= 1; i >>= 1)
-	{
-	  if (epilogue_size & i)
-	    {
-	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
-		destmem = emit_memset (destmem, destptr, vec_value, i);
-	      else
-		destmem = emit_memset (destmem, destptr, value, i);
-	    }
-	}
+      unsigned HOST_WIDE_INT countval = UINTVAL (count);
+      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
+      unsigned int destalign = MEM_ALIGN (destmem);
+      store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
+		       vec_value ? vec_value : value, destalign, true,
+		       RETURN_BEGIN);
       return;
     }
   if (max_size > 32)
@@ -8552,6 +8624,7 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
   rtx modesize;
+  rtx scalar_value = value;
   int n;
 
   /* If we do not have vector value to copy, we must reduce size.  */
@@ -8571,11 +8644,57 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
     {
       /* Choose appropriate vector mode.  */
       if (size >= 32)
-	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+	switch (MOVE_MAX)
+	  {
+	  case 64:
+	    if (size >= 64)
+	      {
+		mode = V64QImode;
+		break;
+	      }
+	    /* FALLTHRU */
+	  case 32:
+	    mode = V32QImode;
+	    break;
+	  case 16:
+	    mode = V16QImode;
+	    break;
+	  case 8:
+	    mode = DImode;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
       else if (size >= 16)
 	mode = TARGET_SSE ? V16QImode : DImode;
       srcmem = change_address (srcmem, mode, srcptr);
     }
+  if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
+    {
+      /* For memset with vector and the size is smaller than the vector
+	 size, first try the narrower vector, otherwise, use the
+	 original value. */
+      machine_mode inner_mode = GET_MODE_INNER (mode);
+      unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
+      if (nunits > 1)
+	{
+	  mode = mode_for_vector (GET_MODE_INNER (mode),
+				  nunits).require ();
+	  value = gen_rtx_SUBREG (mode, value, 0);
+	}
+      else
+	{
+	  scalar_int_mode smode
+	    = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
+	  gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
+		      >= GET_MODE_SIZE (smode));
+	  mode = smode;
+	  if (GET_MODE (scalar_value) == mode)
+	    value = scalar_value;
+	  else
+	    value = gen_rtx_SUBREG (mode, scalar_value, 0);
+	}
+    }
   destmem = change_address (destmem, mode, destptr);
   modesize = GEN_INT (GET_MODE_SIZE (mode));
   gcc_assert (GET_MODE_SIZE (mode) <= size);
@@ -9179,13 +9298,26 @@ decide_alignment (int align,
 static rtx
 promote_duplicated_reg (machine_mode mode, rtx val)
 {
+  if (val == const0_rtx)
+    return copy_to_mode_reg (mode, CONST0_RTX (mode));
+
   machine_mode valmode = GET_MODE (val);
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      /* Duplicate the scalar value for integer vector.  */
+      gcc_assert ((val == const0_rtx || val == constm1_rtx)
+		  || GET_MODE_INNER (mode) == valmode);
+      rtx dup = gen_reg_rtx (mode);
+      bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
+						   val);
+      gcc_assert (ok);
+      return dup;
+    }
+
   rtx tmp;
   int nops = mode == DImode ? 3 : 2;
 
-  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
-  if (val == const0_rtx)
-    return copy_to_mode_reg (mode, CONST0_RTX (mode));
+  gcc_assert (mode == SImode || mode == DImode);
   if (CONST_INT_P (val))
     {
       HOST_WIDE_INT v = INTVAL (val) & 255;
@@ -9413,11 +9545,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
     return false;
   gcc_assert (alg != no_stringop);
 
-  /* For now vector-version of memset is generated only for memory zeroing, as
-     creating of promoted vector value is very cheap in this case.  */
-  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
-    alg = unrolled_loop;
-
   if (!count)
     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
@@ -9510,20 +9637,41 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
        && ((desired_align > align && !align_bytes)
 	   || (!count && epilogue_size_needed > 1)));
 
+  /* Destination is aligned after the misaligned prologue.  */
+  bool aligned_dstmem = misaligned_prologue_used;
+
+  if (noalign && !misaligned_prologue_used)
+    {
+      /* Also use misaligned prologue if alignment isn't needed and
+	 destination isn't aligned.   Since alignment isn't needed,
+	 the destination after prologue won't be aligned.  */
+      aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
+			<= MEM_ALIGN (dst));
+      if (!aligned_dstmem)
+	misaligned_prologue_used = true;
+    }
+
   /* Do the cheap promotion to allow better CSE across the
      main loop and epilogue (ie one load of the big constant in the
      front of all code.
      For now the misaligned move sequences do not have fast path
      without broadcasting.  */
-  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
+  if (issetmem
+      && (alg == vector_loop
+	  || CONST_INT_P (val_exp)
+	  || misaligned_prologue_used))
     {
       if (alg == vector_loop)
 	{
-	  gcc_assert (val_exp == const0_rtx);
-	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
 	  promoted_val = promote_duplicated_reg_to_size (val_exp,
 							 GET_MODE_SIZE (word_mode),
 							 desired_align, align);
+	  /* Duplicate the promoted scalar value if not 0 nor -1.  */
+	  vec_promoted_val
+	    = promote_duplicated_reg (move_mode,
+				      (val_exp == const0_rtx
+				       || val_exp == constm1_rtx)
+				      ? val_exp : promoted_val);
 	}
       else
 	{
@@ -9548,7 +9696,8 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
       if (!issetmem)
         src = change_address (src, BLKmode, srcreg);
       dst = change_address (dst, BLKmode, destreg);
-      set_mem_align (dst, desired_align * BITS_PER_UNIT);
+      if (aligned_dstmem)
+	set_mem_align (dst, desired_align * BITS_PER_UNIT);
       epilogue_size_needed = 0;
       if (need_zero_guard
 	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index a033120..38f63ea 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -137,6 +137,7 @@ extern void riscv_expand_usadd (rtx, rtx, rtx);
 extern void riscv_expand_ssadd (rtx, rtx, rtx);
 extern void riscv_expand_ussub (rtx, rtx, rtx);
 extern void riscv_expand_sssub (rtx, rtx, rtx);
+extern void riscv_expand_usmul (rtx, rtx, rtx);
 extern void riscv_expand_ustrunc (rtx, rtx);
 extern void riscv_expand_sstrunc (rtx, rtx);
 extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index ecdb61e..e09c189 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -13347,6 +13347,88 @@ riscv_expand_sssub (rtx dest, rtx x, rtx y)
   emit_move_insn (dest, gen_lowpart (mode, xmode_dest));
 }
 
+/* Implement the Xmode usmul.
+
+   b = SAT_MUL (a, b);
+   =>
+   _1 = a * b;
+   _2 = mulhu (a, b);
+   _overflow_p = _2 == 0;
+   _mask = - _overflow_p;
+   b = _1 | _mask;
+ */
+
+static void
+riscv_expand_xmode_usmul (rtx dest, rtx x, rtx y)
+{
+  machine_mode mode = GET_MODE (dest);
+
+  gcc_assert (mode == Xmode);
+
+  rtx mul = gen_reg_rtx (Xmode);
+  rtx mulhu = gen_reg_rtx (Xmode);
+  rtx overflow_p = gen_reg_rtx (Xmode);
+
+  riscv_emit_binary (MULT, mul, x, y);
+
+  if (TARGET_64BIT)
+    emit_insn (gen_usmuldi3_highpart (mulhu, x, y));
+  else
+    emit_insn (gen_usmulsi3_highpart (mulhu, x, y));
+
+  riscv_emit_binary (NE, overflow_p, mulhu, CONST0_RTX (Xmode));
+  riscv_emit_unary (NEG, overflow_p, overflow_p);
+  riscv_emit_binary (IOR, dest, mul, overflow_p);
+}
+
+/* Implement the non-Xmode usmul.
+
+   b = SAT_MUL (a, b);
+   =>
+   _1 = a * b;
+   _max = (T)-1
+   _overflow_p = _1 > _max;
+   _mask = - _overflow_p;
+   b = _1 | _mask;
+ */
+
+static void
+riscv_expand_non_xmode_usmul (rtx dest, rtx x, rtx y)
+{
+  machine_mode mode = GET_MODE (dest);
+  unsigned bitsize = GET_MODE_BITSIZE (mode).to_constant ();
+
+  gcc_assert (mode != Xmode);
+
+  rtx xmode_x = riscv_extend_to_xmode_reg (x, mode, ZERO_EXTEND);
+  rtx xmode_y = riscv_extend_to_xmode_reg (y, mode, ZERO_EXTEND);
+  rtx xmode_mul = gen_reg_rtx (Xmode);
+  rtx mul_max = gen_reg_rtx (Xmode);
+  rtx overflow_p = gen_reg_rtx (Xmode);
+
+  uint64_t max = ((uint64_t)1 << bitsize) - 1;
+
+  emit_move_insn (mul_max, GEN_INT (max));
+  riscv_emit_binary (MULT, xmode_mul, xmode_x, xmode_y);
+
+  riscv_emit_binary (LTU, overflow_p, mul_max, xmode_mul);
+  riscv_emit_unary (NEG, overflow_p, overflow_p);
+  riscv_emit_binary (IOR, xmode_mul, xmode_mul, overflow_p);
+
+  emit_move_insn (dest, gen_lowpart (mode, xmode_mul));
+}
+
+/* Implements the unsigned saturation mult standard name usmul for int mode.  */
+
+void
+riscv_expand_usmul (rtx dest, rtx x, rtx y)
+{
+  if (GET_MODE (dest) == Xmode)
+    return riscv_expand_xmode_usmul (dest, x, y) ;
+  else
+    return riscv_expand_non_xmode_usmul (dest, x, y);
+}
+
 /* Implement the unsigned saturation truncation for int mode.
 
    b = SAT_TRUNC (a);
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index f5ec0c5..c6661f5 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -4634,6 +4634,17 @@
   }
 )
 
+(define_expand "usmul<mode>3"
+  [(match_operand:ANYI 0 "register_operand")
+   (match_operand:ANYI 1 "register_operand")
+   (match_operand:ANYI 2 "register_operand")]
+  ""
+  {
+    riscv_expand_usmul (operands[0], operands[1], operands[2]);
+    DONE;
+  }
+)
+
 (define_expand "ustrunc<mode><anyi_double_truncated>2"
   [(match_operand:<ANYI_DOUBLE_TRUNCATED> 0 "register_operand")
    (match_operand:ANYI_DOUBLE_TRUNC       1 "register_operand")]
diff --git a/gcc/config/rs6000/vxworks.h b/gcc/config/rs6000/vxworks.h
index fa2c837b..e77247b 100644
--- a/gcc/config/rs6000/vxworks.h
+++ b/gcc/config/rs6000/vxworks.h
@@ -34,6 +34,21 @@ along with GCC; see the file COPYING3.  If not see
 /* Common definitions first.                                   */
 /*-------------------------------------------------------------*/
 
+/* Default to 64 bits when the target is powerpc64*-wrs-vxworks*,
+   and to 32 bits otherwise.  */
+#undef SUBTARGET_DRIVER_SELF_SPECS
+#if TARGET_VXWORKS64
+#define SUBTARGET_DRIVER_SELF_SPECS "%{!m64:%{!m32:-m64}}"
+#else
+#define SUBTARGET_DRIVER_SELF_SPECS "%{!m32:%{!m64:-m32}}"
+#endif
+
+/* Having used the build-time TARGET_VXWORKS64 to choose the default ABI above,
+   redefine it so that it matches whichever ABI is selected for each
+   compilation.  */
+#undef TARGET_VXWORKS64
+#define TARGET_VXWORKS64 TARGET_64BIT
+
 /* CPP predefined macros.  */
 
 #undef TARGET_OS_CPP_BUILTINS
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 97a4bdf..440ce93 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -139,9 +139,6 @@
    UNSPEC_LCBB
 
    ; Vector
-   UNSPEC_VEC_SMULT_HI
-   UNSPEC_VEC_UMULT_HI
-   UNSPEC_VEC_SMULT_LO
    UNSPEC_VEC_SMULT_EVEN
    UNSPEC_VEC_UMULT_EVEN
    UNSPEC_VEC_SMULT_ODD
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 7251a76..7c706ec 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -3576,3 +3576,29 @@
 ; vec_unpacks_float_lo
 ; vec_unpacku_float_hi
 ; vec_unpacku_float_lo
+
+(define_expand "avg<mode>3_ceil"
+  [(set (match_operand:VIT_HW_VXE3_T                        0 "register_operand")
+	(unspec:VIT_HW_VXE3_T [(match_operand:VIT_HW_VXE3_T 1 "register_operand")
+			       (match_operand:VIT_HW_VXE3_T 2 "register_operand")]
+			      UNSPEC_VEC_AVG))]
+  "TARGET_VX")
+
+(define_expand "uavg<mode>3_ceil"
+  [(set (match_operand:VIT_HW_VXE3_T                        0 "register_operand")
+	(unspec:VIT_HW_VXE3_T [(match_operand:VIT_HW_VXE3_T 1 "register_operand")
+			       (match_operand:VIT_HW_VXE3_T 2 "register_operand")]
+			      UNSPEC_VEC_AVGU))]
+  "TARGET_VX")
+
+(define_expand "smul<mode>3_highpart"
+  [(set (match_operand:VIT_HW_VXE3_DT 0 "register_operand")
+	(smul_highpart:VIT_HW_VXE3_DT (match_operand:VIT_HW_VXE3_DT 1 "register_operand")
+				      (match_operand:VIT_HW_VXE3_DT 2 "register_operand")))]
+  "TARGET_VX")
+
+(define_expand "umul<mode>3_highpart"
+  [(set (match_operand:VIT_HW_VXE3_DT 0 "register_operand")
+	(umul_highpart:VIT_HW_VXE3_DT (match_operand:VIT_HW_VXE3_DT 1 "register_operand")
+				      (match_operand:VIT_HW_VXE3_DT 2 "register_operand")))]
+  "TARGET_VX")
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index a7bb7ff..9e5d18b 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -982,20 +982,18 @@
 
 ; vmhb, vmhh, vmhf, vmhg, vmhq
 (define_insn "vec_smulh<mode>"
-  [(set (match_operand:VIT_HW_VXE3_DT 0 "register_operand"                       "=v")
-	(unspec:VIT_HW_VXE3_DT [(match_operand:VIT_HW_VXE3_DT 1 "register_operand" "v")
-				(match_operand:VIT_HW_VXE3_DT 2 "register_operand" "v")]
-			       UNSPEC_VEC_SMULT_HI))]
+  [(set (match_operand:VIT_HW_VXE3_DT 0 "register_operand"                              "=v")
+	(smul_highpart:VIT_HW_VXE3_DT (match_operand:VIT_HW_VXE3_DT 1 "register_operand" "v")
+				      (match_operand:VIT_HW_VXE3_DT 2 "register_operand" "v")))]
   "TARGET_VX"
   "vmh<bhfgq>\t%v0,%v1,%v2"
   [(set_attr "op_type" "VRR")])
 
 ; vmlhb, vmlhh, vmlhf, vmlhg, vmlhq
 (define_insn "vec_umulh<mode>"
-  [(set (match_operand:VIT_HW_VXE3_DT 0 "register_operand"                       "=v")
-	(unspec:VIT_HW_VXE3_DT [(match_operand:VIT_HW_VXE3_DT 1 "register_operand" "v")
-				(match_operand:VIT_HW_VXE3_DT 2 "register_operand" "v")]
-			       UNSPEC_VEC_UMULT_HI))]
+  [(set (match_operand:VIT_HW_VXE3_DT 0 "register_operand"                              "=v")
+	(umul_highpart:VIT_HW_VXE3_DT (match_operand:VIT_HW_VXE3_DT 1 "register_operand" "v")
+				      (match_operand:VIT_HW_VXE3_DT 2 "register_operand" "v")))]
   "TARGET_VX"
   "vmlh<bhfgq>\t%v0,%v1,%v2"
   [(set_attr "op_type" "VRR")])
diff --git a/gcc/coverage.cc b/gcc/coverage.cc
index dd3ed2e..75a24c6 100644
--- a/gcc/coverage.cc
+++ b/gcc/coverage.cc
@@ -238,6 +238,7 @@ read_counts_file (void)
 	  gcov_profile_info = profile_info = XCNEW (gcov_summary);
 	  profile_info->runs = gcov_read_unsigned ();
 	  profile_info->sum_max = gcov_read_unsigned ();
+	  profile_info->cutoff = 1;
 	}
       else if (GCOV_TAG_IS_COUNTER (tag) && fn_ident)
 	{
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index cac74e3..44a7832 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -16919,6 +16919,15 @@ cp_parser_decomposition_declaration (cp_parser *parser,
       /* Ensure DECL_VALUE_EXPR is created for all the decls but
 	 the underlying DECL.  */
       cp_finish_decomp (decl, &decomp);
+      if (decl_spec_seq_has_spec_p (decl_specifiers, ds_thread))
+	pedwarn (decl_specifiers->locations[ds_thread],
+		 0, "for-range-declaration cannot be %qs",
+		 decl_specifiers->gnu_thread_keyword_p
+		 ? "__thread" : "thread_local");
+      else if (decl_specifiers->storage_class == sc_static)
+	pedwarn (decl_specifiers->locations[ds_storage_class],
+		 0, "for-range-declaration cannot be %qs",
+		 "static");
     }
 
   if (pushed_scope)
@@ -24162,7 +24171,26 @@ cp_parser_init_declarator (cp_parser* parser,
 	  && token->type != CPP_SEMICOLON)
 	{
 	  if (maybe_range_for_decl && *maybe_range_for_decl != error_mark_node)
-	    range_for_decl_p = true;
+	    {
+	      range_for_decl_p = true;
+	      if (decl_spec_seq_has_spec_p (decl_specifiers, ds_thread))
+		pedwarn (decl_specifiers->locations[ds_thread],
+			 0, "for-range-declaration cannot be %qs",
+			 decl_specifiers->gnu_thread_keyword_p
+			 ? "__thread" : "thread_local");
+	      else if (decl_specifiers->storage_class == sc_static)
+		pedwarn (decl_specifiers->locations[ds_storage_class],
+			 0, "for-range-declaration cannot be %qs",
+			 "static");
+	      else if (decl_specifiers->storage_class == sc_extern)
+		pedwarn (decl_specifiers->locations[ds_storage_class],
+			 0, "for-range-declaration cannot be %qs",
+			 "extern");
+	      else if (decl_specifiers->storage_class == sc_register)
+		pedwarn (decl_specifiers->locations[ds_storage_class],
+			 0, "for-range-declaration cannot be %qs",
+			 "register");
+	    }
 	  else
 	    {
 	      if (!maybe_range_for_decl)
diff --git a/gcc/doc/avr-mmcu.texi b/gcc/doc/avr-mmcu.texi
index feb7725..5efcc81 100644
--- a/gcc/doc/avr-mmcu.texi
+++ b/gcc/doc/avr-mmcu.texi
@@ -50,15 +50,15 @@
 
 @item @anchor{avrxmega2}avrxmega2
 ``XMEGA'' devices with more than 8@tie{}KiB and up to 64@tie{}KiB of program memory.
-@*@var{mcu}@tie{}= @code{atxmega8e5}, @code{atxmega16a4}, @code{atxmega16a4u}, @code{atxmega16c4}, @code{atxmega16d4}, @code{atxmega16e5}, @code{atxmega32a4}, @code{atxmega32a4u}, @code{atxmega32c3}, @code{atxmega32c4}, @code{atxmega32d3}, @code{atxmega32d4}, @code{atxmega32e5}, @code{avr64da28}, @code{avr64da32}, @code{avr64da48}, @code{avr64da64}, @code{avr64db28}, @code{avr64db32}, @code{avr64db48}, @code{avr64db64}, @code{avr64dd14}, @code{avr64dd20}, @code{avr64dd28}, @code{avr64dd32}, @code{avr64du28}, @code{avr64du32}, @code{avr64ea28}, @code{avr64ea32}, @code{avr64ea48}, @code{avr64sd28}, @code{avr64sd32}, @code{avr64sd48}.
+@*@var{mcu}@tie{}= @code{atxmega8e5}, @code{atxmega16a4}, @code{atxmega16a4u}, @code{atxmega16c4}, @code{atxmega16d4}, @code{atxmega16e5}, @code{atxmega32a4}, @code{atxmega32a4u}, @code{atxmega32c3}, @code{atxmega32c4}, @code{atxmega32d3}, @code{atxmega32d4}, @code{atxmega32e5}, @code{avr64da28}, @code{avr64da28s}, @code{avr64da32}, @code{avr64da32s}, @code{avr64da48}, @code{avr64da48s}, @code{avr64da64}, @code{avr64da64s}, @code{avr64db28}, @code{avr64db32}, @code{avr64db48}, @code{avr64db64}, @code{avr64dd14}, @code{avr64dd20}, @code{avr64dd28}, @code{avr64dd32}, @code{avr64du28}, @code{avr64du32}, @code{avr64ea28}, @code{avr64ea32}, @code{avr64ea48}, @code{avr64sd28}, @code{avr64sd32}, @code{avr64sd48}.
 
 @item @anchor{avrxmega3}avrxmega3
 ``XMEGA'' devices with up to 64@tie{}KiB of combined program memory and RAM, and with program memory visible in the RAM address space.
-@*@var{mcu}@tie{}= @code{attiny202}, @code{attiny204}, @code{attiny212}, @code{attiny214}, @code{attiny402}, @code{attiny404}, @code{attiny406}, @code{attiny412}, @code{attiny414}, @code{attiny416}, @code{attiny416auto}, @code{attiny417}, @code{attiny424}, @code{attiny426}, @code{attiny427}, @code{attiny804}, @code{attiny806}, @code{attiny807}, @code{attiny814}, @code{attiny816}, @code{attiny817}, @code{attiny824}, @code{attiny826}, @code{attiny827}, @code{attiny1604}, @code{attiny1606}, @code{attiny1607}, @code{attiny1614}, @code{attiny1616}, @code{attiny1617}, @code{attiny1624}, @code{attiny1626}, @code{attiny1627}, @code{attiny3214}, @code{attiny3216}, @code{attiny3217}, @code{attiny3224}, @code{attiny3226}, @code{attiny3227}, @code{atmega808}, @code{atmega809}, @code{atmega1608}, @code{atmega1609}, @code{atmega3208}, @code{atmega3209}, @code{atmega4808}, @code{atmega4809}, @code{avr16dd14}, @code{avr16dd20}, @code{avr16dd28}, @code{avr16dd32}, @code{avr16du14}, @code{avr16du20}, @code{avr16du28}, @code{avr16du32}, @code{avr16ea28}, @code{avr16ea32}, @code{avr16ea48}, @code{avr16eb14}, @code{avr16eb20}, @code{avr16eb28}, @code{avr16eb32}, @code{avr32da28}, @code{avr32da32}, @code{avr32da48}, @code{avr32db28}, @code{avr32db32}, @code{avr32db48}, @code{avr32dd14}, @code{avr32dd20}, @code{avr32dd28}, @code{avr32dd32}, @code{avr32du14}, @code{avr32du20}, @code{avr32du28}, @code{avr32du32}, @code{avr32ea28}, @code{avr32ea32}, @code{avr32ea48}, @code{avr32sd20}, @code{avr32sd28}, @code{avr32sd32}.
+@*@var{mcu}@tie{}= @code{attiny202}, @code{attiny204}, @code{attiny212}, @code{attiny214}, @code{attiny402}, @code{attiny404}, @code{attiny406}, @code{attiny412}, @code{attiny414}, @code{attiny416}, @code{attiny416auto}, @code{attiny417}, @code{attiny424}, @code{attiny426}, @code{attiny427}, @code{attiny804}, @code{attiny806}, @code{attiny807}, @code{attiny814}, @code{attiny816}, @code{attiny817}, @code{attiny824}, @code{attiny826}, @code{attiny827}, @code{attiny1604}, @code{attiny1606}, @code{attiny1607}, @code{attiny1614}, @code{attiny1616}, @code{attiny1617}, @code{attiny1624}, @code{attiny1626}, @code{attiny1627}, @code{attiny3214}, @code{attiny3216}, @code{attiny3217}, @code{attiny3224}, @code{attiny3226}, @code{attiny3227}, @code{atmega808}, @code{atmega809}, @code{atmega1608}, @code{atmega1609}, @code{atmega3208}, @code{atmega3209}, @code{atmega4808}, @code{atmega4809}, @code{avr16dd14}, @code{avr16dd20}, @code{avr16dd28}, @code{avr16dd32}, @code{avr16du14}, @code{avr16du20}, @code{avr16du28}, @code{avr16du32}, @code{avr16ea28}, @code{avr16ea32}, @code{avr16ea48}, @code{avr16eb14}, @code{avr16eb20}, @code{avr16eb28}, @code{avr16eb32}, @code{avr32da28}, @code{avr32da28s}, @code{avr32da32}, @code{avr32da32s}, @code{avr32da48}, @code{avr32da48s}, @code{avr32db28}, @code{avr32db32}, @code{avr32db48}, @code{avr32dd14}, @code{avr32dd20}, @code{avr32dd28}, @code{avr32dd32}, @code{avr32du14}, @code{avr32du20}, @code{avr32du28}, @code{avr32du32}, @code{avr32ea28}, @code{avr32ea32}, @code{avr32ea48}, @code{avr32sd20}, @code{avr32sd28}, @code{avr32sd32}.
 
 @item @anchor{avrxmega4}avrxmega4
 ``XMEGA'' devices with more than 64@tie{}KiB and up to 128@tie{}KiB of program memory.
-@*@var{mcu}@tie{}= @code{atxmega64a3}, @code{atxmega64a3u}, @code{atxmega64a4u}, @code{atxmega64b1}, @code{atxmega64b3}, @code{atxmega64c3}, @code{atxmega64d3}, @code{atxmega64d4}, @code{avr128da28}, @code{avr128da32}, @code{avr128da48}, @code{avr128da64}, @code{avr128db28}, @code{avr128db32}, @code{avr128db48}, @code{avr128db64}.
+@*@var{mcu}@tie{}= @code{atxmega64a3}, @code{atxmega64a3u}, @code{atxmega64a4u}, @code{atxmega64b1}, @code{atxmega64b3}, @code{atxmega64c3}, @code{atxmega64d3}, @code{atxmega64d4}, @code{avr128da28}, @code{avr128da28s}, @code{avr128da32}, @code{avr128da32s}, @code{avr128da48}, @code{avr128da48s}, @code{avr128da64}, @code{avr128da64s}, @code{avr128db28}, @code{avr128db32}, @code{avr128db48}, @code{avr128db64}.
 
 @item @anchor{avrxmega5}avrxmega5
 ``XMEGA'' devices with more than 64@tie{}KiB and up to 128@tie{}KiB of program memory and more than 64@tie{}KiB of RAM.
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 70adf2d..a119ad3 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -15553,7 +15553,7 @@ are 128-bit.  Only supported on targets when 128-bit types are supported.
 Returns the calculated 8-bit bit-reversed CRC using the initial CRC (8-bit),
 data (8-bit) and the polynomial (8-bit).
 @var{crc} is the initial CRC, @var{data} is the data and
-@var{poly} is the polynomial without leading 1.
+@var{poly} is the polynomial without leading 1. @var{poly} is required to be a compile-time constant.
 Table-based or clmul-based CRC may be used for the
 calculation, depending on the target architecture.
 @enddefbuiltin
@@ -15608,7 +15608,7 @@ is 32-bit.
 Returns the calculated 8-bit bit-forward CRC using the initial CRC (8-bit),
 data (8-bit) and the polynomial (8-bit).
 @var{crc} is the initial CRC, @var{data} is the data and
-@var{poly} is the polynomial without leading 1.
+@var{poly} is the polynomial without leading 1. @var{poly} is required to be a compile-time constant.
 Table-based or clmul-based CRC may be used for the
 calculation, depending on the target architecture.
 @enddefbuiltin
diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
index afe7afe..e7635fb 100644
--- a/gcc/ext-dce.cc
+++ b/gcc/ext-dce.cc
@@ -651,9 +651,8 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj,
 
 	  /* ?!? How much of this should mirror SET handling, potentially
 	     being shared?   */
-	  if (SUBREG_P (dst) && SUBREG_BYTE (dst).is_constant ())
+	  if (SUBREG_P (dst) && subreg_lsb (dst).is_constant (&bit))
 	    {
-	      bit = subreg_lsb (dst).to_constant ();
 	      if (bit >= HOST_BITS_PER_WIDE_INT)
 		bit = HOST_BITS_PER_WIDE_INT - 1;
 	      dst = SUBREG_REG (dst);
diff --git a/gcc/fortran/trans-intrinsic.cc b/gcc/fortran/trans-intrinsic.cc
index f1bfd3e..be98427 100644
--- a/gcc/fortran/trans-intrinsic.cc
+++ b/gcc/fortran/trans-intrinsic.cc
@@ -13101,6 +13101,8 @@ conv_intrinsic_move_alloc (gfc_code *code)
     }
   gfc_conv_expr_descriptor (&to_se, to_expr);
   gfc_conv_expr_descriptor (&from_se, from_expr);
+  gfc_add_block_to_block (&block, &to_se.pre);
+  gfc_add_block_to_block (&block, &from_se.pre);
 
   /* For coarrays, call SYNC ALL if TO is already deallocated as MOVE_ALLOC
      is an image control "statement", cf. IR F08/0040 in 12-006A.  */
@@ -13174,6 +13176,9 @@ conv_intrinsic_move_alloc (gfc_code *code)
   if (fin_label)
     gfc_add_expr_to_block (&block, build1_v (LABEL_EXPR, fin_label));
 
+  gfc_add_block_to_block (&block, &to_se.post);
+  gfc_add_block_to_block (&block, &from_se.post);
+
   return gfc_finish_block (&block);
 }
 
diff --git a/gcc/gcov-io.h b/gcc/gcov-io.h
index d48291c..f3e3a1c 100644
--- a/gcc/gcov-io.h
+++ b/gcc/gcov-io.h
@@ -349,6 +349,11 @@ struct gcov_summary
 {
   gcov_unsigned_t runs;		/* Number of program runs.  */
   gcov_type sum_max;    	/* Sum of individual run max values.  */
+  gcov_type cutoff;		/* Values smaller than this value are not
+				   reliable (0 may mean non-zero).
+				   For read profile cutoff is typically 1
+				   however when we scale up or use auto-fdo
+				   it may become bigger value.  */
 };
 
 #if !defined(inhibit_libc)
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 3f4ac93..ed6ef0e 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4031,9 +4031,14 @@ expand_crc_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab)
   rtx dest = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
   rtx crc = expand_normal (rhs1);
   rtx data = expand_normal (rhs2);
-  gcc_assert (TREE_CODE (rhs3) == INTEGER_CST);
-  rtx polynomial = gen_rtx_CONST_INT (TYPE_MODE (result_type),
-				      TREE_INT_CST_LOW (rhs3));
+  rtx polynomial;
+  if (TREE_CODE (rhs3) != INTEGER_CST)
+    {
+      error ("third argument to %<crc%> builtins must be a constant");
+      polynomial = const0_rtx;
+    }
+  else
+    polynomial = convert_to_mode (TYPE_MODE (result_type), expand_normal (rhs3), 0);
 
   /* Use target specific expansion if it exists.
      Otherwise, generate table-based CRC.  */
@@ -4423,6 +4428,7 @@ commutative_binary_fn_p (internal_fn fn)
     case IFN_ADD_OVERFLOW:
     case IFN_MUL_OVERFLOW:
     case IFN_SAT_ADD:
+    case IFN_SAT_MUL:
     case IFN_VEC_WIDEN_PLUS:
     case IFN_VEC_WIDEN_PLUS_LO:
     case IFN_VEC_WIDEN_PLUS_HI:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 8edfa35..914ee9f2 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -282,6 +282,7 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
 
 DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary)
 DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST, first, sssub, ussub, binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_MUL, ECF_CONST, first, ssmul, usmul, binary)
 
 DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_TRUNC, ECF_CONST, first, sstrunc, ustrunc, unary_convert)
 
diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index ca605b0..0cf97a80 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -2222,6 +2222,7 @@ inline_small_functions (void)
 
   gcc_assert (in_lto_p
 	      || !(max_count > 0)
+	      || flag_auto_profile
 	      || (profile_info && flag_branch_probabilities));
 
   while (!edge_heap.empty ())
diff --git a/gcc/lto-cgraph.cc b/gcc/lto-cgraph.cc
index ec34f65..0af2e88 100644
--- a/gcc/lto-cgraph.cc
+++ b/gcc/lto-cgraph.cc
@@ -718,11 +718,12 @@ output_profile_summary (struct lto_simple_output_block *ob)
 {
   if (profile_info)
     {
-      /* We do not output num and run_max, they are not used by
-         GCC profile feedback and they are difficult to merge from multiple
-         units.  */
       unsigned runs = (profile_info->runs);
       streamer_write_uhwi_stream (ob->main_stream, runs);
+      streamer_write_gcov_count_stream (ob->main_stream,
+					profile_info->sum_max);
+      streamer_write_gcov_count_stream (ob->main_stream,
+					profile_info->cutoff);
 
       /* IPA-profile computes hot bb threshold based on cumulated
 	 whole program profile.  We need to stream it down to ltrans.  */
@@ -1678,6 +1679,8 @@ input_profile_summary (class lto_input_block *ib,
   if (runs)
     {
       file_data->profile_info.runs = runs;
+      file_data->profile_info.sum_max = streamer_read_gcov_count (ib);
+      file_data->profile_info.cutoff = streamer_read_gcov_count (ib);
 
       /* IPA-profile computes hot bb threshold based on cumulated
 	 whole program profile.  We need to stream it down to ltrans.  */
@@ -1719,6 +1722,8 @@ merge_profile_summaries (struct lto_file_decl_data **file_data_vec)
 
   profile_info = XCNEW (gcov_summary);
   profile_info->runs = max_runs;
+  profile_info->sum_max = 0;
+  profile_info->cutoff = 0;
 
   /* If merging already happent at WPA time, we are done.  */
   if (flag_ltrans)
@@ -1735,6 +1740,14 @@ merge_profile_summaries (struct lto_file_decl_data **file_data_vec)
 
 	scale = RDIV (node->count_materialization_scale * max_runs,
                       node->lto_file_data->profile_info.runs);
+	gcov_type sum_max = RDIV (node->lto_file_data->profile_info.sum_max * max_runs,
+				  node->lto_file_data->profile_info.runs);
+	gcov_type cutoff = RDIV (node->lto_file_data->profile_info.cutoff * max_runs,
+				 node->lto_file_data->profile_info.runs);
+	if (sum_max > profile_info->sum_max)
+	  profile_info->sum_max = sum_max;
+	if (cutoff > profile_info->cutoff)
+	  profile_info->cutoff = cutoff;
 	node->count_materialization_scale = scale;
 	if (scale < 0)
 	  fatal_error (input_location, "Profile information in %s corrupted",
diff --git a/gcc/match.pd b/gcc/match.pd
index 10c2b97..ec2f560 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3583,6 +3583,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 	 || (wi::eq_p (int_cst_1, itype_max) && wi::eq_p (int_cst_2, limit_1)))
 	 && wi::eq_p (int_cst_3, otype_max)))))))
 
+/* Saturation mult for unsigned integer.  */
+(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
+  (match (unsigned_integer_sat_mul @0 @1)
+   /* SAT_U_MUL (X, Y) = {
+	WT x = (WT)a * (WT)b;
+	T max = -1;
+	if (x > (WT)(max))
+	  return max;
+	else
+	  return (T)x;
+      }
+      while WT is uint128_t, T is uint8_t, uint16_t, uint32_t or uint64_t.  */
+   (convert@4 (min (widen_mult:c@3 (convert@5 (convert @0))
+				   (convert@6 (convert @1)))
+		   INTEGER_CST@2))
+   (if (types_match (type, @0, @1) && types_match (type, @4))
+    (with
+     {
+      unsigned prec = TYPE_PRECISION (type);
+      unsigned widen_prec = TYPE_PRECISION (TREE_TYPE (@3));
+      unsigned cvt5_prec = TYPE_PRECISION (TREE_TYPE (@5));
+      unsigned cvt6_prec = TYPE_PRECISION (TREE_TYPE (@6));
+      unsigned hw_int_prec = sizeof (HOST_WIDE_INT) * 8;
+      wide_int c2 = wi::to_wide (@2);
+      wide_int max = wi::mask (prec, false, widen_prec);
+      bool c2_is_max_p = wi::eq_p (c2, max);
+      bool widen_mult_p = cvt5_prec == cvt6_prec && hw_int_prec == cvt5_prec;
+     }
+     (if (widen_prec > prec && c2_is_max_p && widen_mult_p)))))
+)
+
 /* The boundary condition for case 10: IMM = 1:
    SAT_U_SUB = X >= IMM ? (X - IMM) : 0.
    simplify (X != 0 ? X + ~0 : 0) to X - (X != 0).  */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 0c1435d..87a8b85 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -134,8 +134,8 @@ OPTAB_NX(smul_optab, "mul$P$a3")
 OPTAB_NX(smul_optab, "mul$F$a3")
 OPTAB_VL(smulv_optab, "mulv$I$a3", MULT, "mul", '3', gen_intv_fp_libfunc)
 OPTAB_VX(smulv_optab, "mul$F$a3")
-OPTAB_NL(ssmul_optab, "ssmul$Q$a3", SS_MULT, "ssmul", '3', gen_signed_fixed_libfunc)
-OPTAB_NL(usmul_optab, "usmul$Q$a3", US_MULT, "usmul", '3', gen_unsigned_fixed_libfunc)
+OPTAB_NL(ssmul_optab, "ssmul$a3", SS_MULT, "ssmul", '3', gen_signed_fixed_libfunc)
+OPTAB_NL(usmul_optab, "usmul$a3", US_MULT, "usmul", '3', gen_unsigned_fixed_libfunc)
 OPTAB_NL(sdiv_optab, "div$a3", DIV, "div", '3', gen_int_fp_signed_fixed_libfunc)
 OPTAB_VL(sdivv_optab, "divv$I$a3", DIV, "divv", '3', gen_int_libfunc)
 OPTAB_VX(sdivv_optab, "div$F$a3")
diff --git a/gcc/profile-count.cc b/gcc/profile-count.cc
index 190bbeb..8f05a79 100644
--- a/gcc/profile-count.cc
+++ b/gcc/profile-count.cc
@@ -32,6 +32,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cgraph.h"
 #include "wide-int.h"
 #include "sreal.h"
+#include "profile.h"
 
 /* Names from profile_quality enum values.  */
 
@@ -557,7 +558,7 @@ profile_count::operator* (const sreal &num) const
   sreal scaled = num * m_val;
   gcc_checking_assert (scaled >= 0);
   profile_count ret;
-  if (m_val > max_count)
+  if (scaled > max_count)
     ret.m_val = max_count;
   else
     ret.m_val = scaled.to_nearest_int ();
@@ -570,3 +571,27 @@ profile_count::operator*= (const sreal &num)
 {
   return *this * num;
 }
+
+/* Make counter forcibly nonzero.  */
+profile_count
+profile_count::force_nonzero () const
+{
+  if (!initialized_p ())
+    return *this;
+  profile_count ret = *this;
+  /* Generally values are forced non-zero to handle inconsistent profile 
+     where count 0 needs to be scaled up to non-zero.
+
+     Use cutoff value here to avoid situation where profile has large
+     cutoff and we perform count = count * num / den where num is non-zero
+     and den is 0.   If profile was scaled by large factor, forcing value
+     to 1 would lead to large scale factor.  */
+  gcov_unsigned_t small = profile_info ? profile_info->cutoff / 2 + 1
+			  : 1;
+  if (ret.m_val < small)
+    {
+      ret.m_val = small;
+      ret.m_quality = MIN (m_quality, ADJUSTED);
+    }
+  return ret;
+}
diff --git a/gcc/profile-count.h b/gcc/profile-count.h
index 2160540..20c03a2 100644
--- a/gcc/profile-count.h
+++ b/gcc/profile-count.h
@@ -1112,18 +1112,7 @@ public:
     }
 
   /* Make counter forcibly nonzero.  */
-  profile_count force_nonzero () const
-    {
-      if (!initialized_p ())
-	return *this;
-      profile_count ret = *this;
-      if (ret.m_val == 0)
-	{
-	  ret.m_val = 1;
-	  ret.m_quality = MIN (m_quality, ADJUSTED);
-	}
-      return ret;
-    }
+  profile_count force_nonzero () const;
 
   profile_count max (profile_count other) const
     {
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index e234630..6ad847d 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,12 @@
+2025-07-06  Andrew Pinski  <quic_apinski@quicinc.com>
+
+	PR tree-optimization/120951
+	* gcc.dg/torture/pr120951-1.c: New test.
+
+2025-07-06  Jan Hubicka  <hubicka@ucw.cz>
+
+	* gcc.dg/tree-prof/clone-merge-1.c:
+
 2025-07-04  Vineet Gupta  <vineetg@rivosinc.com>
 
 	PR target/118241
diff --git a/gcc/testsuite/ada/acats-3/tests/c9/c94001c.ada b/gcc/testsuite/ada/acats-3/tests/c9/c94001c.ada
index 0cc14f4..df38f99 100644
--- a/gcc/testsuite/ada/acats-3/tests/c9/c94001c.ada
+++ b/gcc/testsuite/ada/acats-3/tests/c9/c94001c.ada
@@ -211,7 +211,7 @@ BEGIN
 
      BEGIN -- (E)
           WHILE NOT(OUT_TSK'TERMINATED) AND DELAY_COUNT < 60 LOOP
-               DELAY 1.0;
+               DELAY 1.0 * Impdef.One_Long_Second;
                DELAY_COUNT := DELAY_COUNT + 1;
           END LOOP;
           IF DELAY_COUNT = 60 THEN
@@ -254,7 +254,7 @@ BEGIN
 
      BEGIN
           WHILE NOT(OUT_TSK'TERMINATED) AND DELAY_COUNT < 60 LOOP
-               DELAY 1.0;
+               DELAY 1.0 * Impdef.One_Long_Second;
                DELAY_COUNT := DELAY_COUNT + 1;
           END LOOP;
           IF DELAY_COUNT = 60 THEN
diff --git a/gcc/testsuite/ada/acats-4/tests/c9/c940005.a b/gcc/testsuite/ada/acats-4/tests/c9/c940005.a
index adb58b1..47a97bf 100644
--- a/gcc/testsuite/ada/acats-4/tests/c9/c940005.a
+++ b/gcc/testsuite/ada/acats-4/tests/c9/c940005.a
@@ -85,7 +85,7 @@ begin
       -- In reality one would expect a time of 5 to 10 seconds.  In
       -- the interests of speeding up the test suite a shorter time
       -- is used
-      Pulse_Time_Delta : constant duration := ImpDef.Switch_To_New_Task;
+      Pulse_Time_Delta : constant duration := ImpDef.Long_Switch_To_New_Task;
 
       -- control over stopping tasks
       protected Control is
diff --git a/gcc/testsuite/ada/acats-4/tests/c9/c940007.a b/gcc/testsuite/ada/acats-4/tests/c9/c940007.a
index c678463..41e80f4 100644
--- a/gcc/testsuite/ada/acats-4/tests/c9/c940007.a
+++ b/gcc/testsuite/ada/acats-4/tests/c9/c940007.a
@@ -90,7 +90,7 @@ begin
       -- In reality one would expect a time of 5 to 10 seconds.  In
       -- the interests of speeding up the test suite a shorter time
       -- is used
-      Pulse_Time_Delta : constant duration := ImpDef.Switch_To_New_Task;
+      Pulse_Time_Delta : constant duration := ImpDef.Long_Switch_To_New_Task;
 
 
       -- control over stopping tasks
diff --git a/gcc/testsuite/ada/acats-4/tests/c9/c94001c.ada b/gcc/testsuite/ada/acats-4/tests/c9/c94001c.ada
index 0cc14f4..df38f99 100644
--- a/gcc/testsuite/ada/acats-4/tests/c9/c94001c.ada
+++ b/gcc/testsuite/ada/acats-4/tests/c9/c94001c.ada
@@ -211,7 +211,7 @@ BEGIN
 
      BEGIN -- (E)
           WHILE NOT(OUT_TSK'TERMINATED) AND DELAY_COUNT < 60 LOOP
-               DELAY 1.0;
+               DELAY 1.0 * Impdef.One_Long_Second;
                DELAY_COUNT := DELAY_COUNT + 1;
           END LOOP;
           IF DELAY_COUNT = 60 THEN
@@ -254,7 +254,7 @@ BEGIN
 
      BEGIN
           WHILE NOT(OUT_TSK'TERMINATED) AND DELAY_COUNT < 60 LOOP
-               DELAY 1.0;
+               DELAY 1.0 * Impdef.One_Long_Second;
                DELAY_COUNT := DELAY_COUNT + 1;
           END LOOP;
           IF DELAY_COUNT = 60 THEN
diff --git a/gcc/testsuite/ada/acats-4/tests/c9/c94006a.ada b/gcc/testsuite/ada/acats-4/tests/c9/c94006a.ada
index 6b9c85f..cac5fc6 100644
--- a/gcc/testsuite/ada/acats-4/tests/c9/c94006a.ada
+++ b/gcc/testsuite/ada/acats-4/tests/c9/c94006a.ada
@@ -28,6 +28,7 @@
 -- TBN  9/17/86
 -- PWN 01/31/95  REMOVED PRAGMA PRIORITY FOR ADA 9X.
 
+with Impdef;
 WITH REPORT; USE REPORT;
 WITH SYSTEM; USE SYSTEM;
 PROCEDURE C94006A IS
@@ -41,7 +42,7 @@ PROCEDURE C94006A IS
           SELECT
                ACCEPT E;
           OR
-               DELAY 30.0;
+               DELAY 30.0 * Impdef.One_Long_Second;
           END SELECT;
      END TT;
 
diff --git a/gcc/testsuite/ada/acats-4/tests/c9/c94008c.ada b/gcc/testsuite/ada/acats-4/tests/c9/c94008c.ada
index 6d10e25..fb2eee9 100644
--- a/gcc/testsuite/ada/acats-4/tests/c9/c94008c.ada
+++ b/gcc/testsuite/ada/acats-4/tests/c9/c94008c.ada
@@ -33,6 +33,7 @@
 -- JBG 8/29/86 ELIMINATED SHARED VARIABLES; ADDED GENERIC UNIT
 -- PWN 11/30/94 REMOVED PRAGMA PRIORITY INSTANCES FOR ADA 9X.
 
+with Impdef;
 WITH REPORT; USE REPORT;
 WITH SYSTEM; USE SYSTEM;
 PROCEDURE C94008C IS
@@ -198,10 +199,10 @@ BEGIN -- C94008C
                     OR WHEN ENTER_TERMINATE => TERMINATE;
                     END SELECT;
 
-                    DELAY 10.0;
+                    DELAY 10.0 * Impdef.One_Second;
 
                     IF TERMINATE_COUNT.GET /= 1 THEN
-                         DELAY 20.0;
+                         DELAY 20.0 * Impdef.One_Long_Second;
                     END IF;
 
                     IF TERMINATE_COUNT.GET /= 1 THEN
@@ -239,10 +240,10 @@ BEGIN -- C94008C
 
      BEGIN
 
-          DELAY 10.0; -- WAIT FOR T1, T2, AND T3 TO GET TO SELECT STMTS.
+          DELAY 10.0 * Impdef.One_Second; -- WAIT FOR T1, T2, AND T3 TO GET TO SELECT STMTS.
 
            IF TERMINATE_COUNT.GET /= 3 THEN
-                DELAY 20.0;
+                DELAY 20.0 * Impdef.One_Long_Second;
            END IF;
 
            IF TERMINATE_COUNT.GET /= 3 THEN
diff --git a/gcc/testsuite/ada/acats-4/tests/c9/c951002.a b/gcc/testsuite/ada/acats-4/tests/c9/c951002.a
index 8ccb2d0..65b696c 100644
--- a/gcc/testsuite/ada/acats-4/tests/c9/c951002.a
+++ b/gcc/testsuite/ada/acats-4/tests/c9/c951002.a
@@ -278,14 +278,14 @@ begin
       -- Wait until the message is queued on the entry before starting
       -- the Credit_Task
       while not Hold.TC_Message_is_Queued loop
-         delay ImpDef.Minimum_Task_Switch;   
+         delay ImpDef.Long_Minimum_Task_Switch;   
       end loop;
       --
       Credit_Task.TC_Start;
 
       -- Ensure the first part of the test is complete before continuing
       while not (Credit_Message'terminated and Credit_Task'terminated) loop
-         delay ImpDef.Minimum_Task_Switch;   
+         delay ImpDef.Long_Minimum_Task_Switch;   
       end loop;
 
       --======================================================
@@ -298,12 +298,12 @@ begin
       -- for it to reach the accept statement and call Hold.Set_DB_Overload
       -- before starting Debit_Message
       --
-      delay ImpDef.Switch_To_New_Task;
+      delay ImpDef.Long_Switch_To_New_Task;
 
       Debit_Message.TC_Start;
 
       while not Debit_Task'terminated loop
-         delay ImpDef.Minimum_Task_Switch;   
+         delay ImpDef.Long_Minimum_Task_Switch;   
       end loop;    
   
       Hold.Clear_DB_Overload;  -- Allow completion 
diff --git a/gcc/testsuite/ada/acats-4/tests/c9/c954a01.a b/gcc/testsuite/ada/acats-4/tests/c9/c954a01.a
index 34f48b2..3ea545a 100644
--- a/gcc/testsuite/ada/acats-4/tests/c9/c954a01.a
+++ b/gcc/testsuite/ada/acats-4/tests/c9/c954a01.a
@@ -148,7 +148,7 @@ package body C954A01_0 is  -- Printer server abstraction.
          end select;
 
          -- Allow other tasks to get control
-         delay ImpDef.Minimum_Task_Switch;
+         delay ImpDef.Long_Minimum_Task_Switch;
 
       end loop;
 
@@ -175,7 +175,7 @@ use  F954A00;
 
 procedure C954A01 is
 
-   Long_Enough : constant Duration := ImpDef.Switch_To_New_Task;
+   Long_Enough : constant Duration := ImpDef.Long_Switch_To_New_Task;
 
                --==============================================--
 
diff --git a/gcc/testsuite/ada/acats-4/tests/c9/c96001a.ada b/gcc/testsuite/ada/acats-4/tests/c9/c96001a.ada
index 74374b9..f958ea1 100644
--- a/gcc/testsuite/ada/acats-4/tests/c9/c96001a.ada
+++ b/gcc/testsuite/ada/acats-4/tests/c9/c96001a.ada
@@ -36,6 +36,7 @@
 --     RJW 11/13/87  ADDED CODE WHICH ALLOWS TEST TO REPORT "PASSED"
 --                   IF TICK > DURATION'SMALL.
 
+with Impdef;
 WITH CALENDAR;  USE CALENDAR;
 WITH SYSTEM;    USE SYSTEM;
 WITH REPORT;    USE REPORT;
@@ -50,7 +51,7 @@ BEGIN
      ---------------------------------------------
 
      DECLARE   -- (A)
-          X : DURATION := 5.0;
+          X : DURATION := 5.0 * Impdef.One_Second;
           OLD_TIME : TIME;
           LAPSE : DURATION;
      BEGIN     -- (A)
@@ -138,8 +139,8 @@ BEGIN
      ---------------------------------------------
 
      DECLARE   -- (E)
-          INC1 : DURATION := 2.0;
-          INC2 : DURATION := 3.0;
+          INC1 : DURATION := 2.0 * Impdef.One_Second;
+          INC2 : DURATION := 3.0 * Impdef.One_Second;
           OLD_TIME : TIME;
           LAPSE : DURATION;
      BEGIN     -- (E)
diff --git a/gcc/testsuite/g++.dg/cpp0x/range-for40.C b/gcc/testsuite/g++.dg/cpp0x/range-for40.C
new file mode 100644
index 0000000..dea4a2a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/range-for40.C
@@ -0,0 +1,41 @@
+// PR c++/84009
+// { dg-do compile { target c++11 } }
+
+int z[64];
+
+void
+foo ()
+{
+  for (static auto a : z)		// { dg-error "for-range-declaration cannot be 'static'" }
+    ;
+  for (thread_local auto a : z)		// { dg-error "for-range-declaration cannot be 'thread_local'" }
+    ;
+  for (__thread auto a : z)		// { dg-error "for-range-declaration cannot be '__thread'" }
+    ;					// { dg-error "function-scope 'a' implicitly auto and declared '__thread'" "" { target *-*-* } .-1 }
+  for (register auto a : z)		// { dg-error "for-range-declaration cannot be 'register'" }
+    ;					// { dg-error "does not allow 'register' storage class specifier" "" { target c++17 } .-1 }
+  for (extern auto a : z)		// { dg-error "for-range-declaration cannot be 'extern'" }
+    ;					// { dg-error "'a' has both 'extern' and initializer" "" { target *-*-* } .-1 }
+  for (mutable auto a : z)		// { dg-error "non-member 'a' cannot be declared 'mutable'" }
+    ;
+  for (virtual auto a : z)		// { dg-error "'virtual' outside class declaration" }
+    ;
+  for (explicit auto a : z)		// { dg-error "'explicit' outside class declaration" }
+    ;
+  for (friend auto a : z)		// { dg-error "'friend' used outside of class" }
+    ;
+  for (typedef auto a : z)		// { dg-error "typedef declared 'auto'" }
+    ;					// { dg-error "typedef 'a' is initialized \\\(use 'decltype' instead\\\)" "" { target *-*-* } .-1 }
+#if __cplusplus >= 202002L
+  for (consteval auto a : z)		// { dg-error "a variable cannot be declared 'consteval'" "" { target c++20 } }
+    ;
+  for (constinit auto a : z)		// { dg-error "'constinit' can only be applied to a variable with static or thread storage duration" "" { target c++20 } }
+    ;
+#endif
+  for (inline auto a : z)		// { dg-error "'inline' specifier invalid for variable 'a' declared at block scope" }
+    ;
+  for (struct S { int a; } a : z)	// { dg-error "types may not be defined in a for-range-declaration" }
+    ;					// { dg-error "conversion from 'int' to non-scalar type 'foo\\\(\\\)::S' requested" "" { target *-*-* } .-1 }
+  for (enum E { E0 } a : z)		// { dg-error "types may not be defined in a for-range-declaration" }
+    ;					// { dg-error "invalid conversion from 'int' to 'foo\\\(\\\)::E'" "" { target *-*-* } .-1 }
+}
diff --git a/gcc/testsuite/g++.dg/cpp0x/range-for41.C b/gcc/testsuite/g++.dg/cpp0x/range-for41.C
new file mode 100644
index 0000000..d690365
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/range-for41.C
@@ -0,0 +1,42 @@
+// PR c++/84009
+// { dg-do compile { target c++11 } }
+// { dg-options "" }
+
+int z[64];
+
+void
+foo ()
+{
+  for (static auto a : z)		// { dg-warning "for-range-declaration cannot be 'static'" }
+    ;
+  for (thread_local auto a : z)		// { dg-warning "for-range-declaration cannot be 'thread_local'" }
+    ;
+  for (__thread auto a : z)		// { dg-warning "for-range-declaration cannot be '__thread'" }
+    ;					// { dg-warning "function-scope 'a' implicitly auto and declared '__thread'" "" { target *-*-* } .-1 }
+  for (register auto a : z)		// { dg-warning "for-range-declaration cannot be 'register'" }
+    ;					// { dg-warning "does not allow 'register' storage class specifier" "" { target c++17 } .-1 }
+  for (extern auto a : z)		// { dg-warning "for-range-declaration cannot be 'extern'" }
+    ;					// { dg-error "'a' has both 'extern' and initializer" "" { target *-*-* } .-1 }
+  for (mutable auto a : z)		// { dg-error "non-member 'a' cannot be declared 'mutable'" }
+    ;
+  for (virtual auto a : z)		// { dg-error "'virtual' outside class declaration" }
+    ;
+  for (explicit auto a : z)		// { dg-error "'explicit' outside class declaration" }
+    ;
+  for (friend auto a : z)		// { dg-error "'friend' used outside of class" }
+    ;
+  for (typedef auto a : z)		// { dg-error "typedef declared 'auto'" }
+    ;					// { dg-error "typedef 'a' is initialized \\\(use 'decltype' instead\\\)" "" { target *-*-* } .-1 }
+#if __cplusplus >= 202002L
+  for (consteval auto a : z)		// { dg-error "a variable cannot be declared 'consteval'" "" { target c++20 } }
+    ;
+  for (constinit auto a : z)		// { dg-error "'constinit' can only be applied to a variable with static or thread storage duration" "" { target c++20 } }
+    ;
+#endif
+  for (inline auto a : z)		// { dg-error "'inline' specifier invalid for variable 'a' declared at block scope" }
+    ;
+  for (struct S { int a; } a : z)	// { dg-error "types may not be defined in a for-range-declaration" }
+    ;					// { dg-error "conversion from 'int' to non-scalar type 'foo\\\(\\\)::S' requested" "" { target *-*-* } .-1 }
+  for (enum E { E0 } a : z)		// { dg-error "types may not be defined in a for-range-declaration" }
+    ;					// { dg-error "invalid conversion from 'int' to 'foo\\\(\\\)::E'" "" { target *-*-* } .-1 }
+}
diff --git a/gcc/testsuite/g++.dg/cpp0x/range-for42.C b/gcc/testsuite/g++.dg/cpp0x/range-for42.C
new file mode 100644
index 0000000..a5d94fc
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/range-for42.C
@@ -0,0 +1,41 @@
+// PR c++/84009
+// { dg-do compile { target c++11 } }
+
+struct S { int y; } z[64];
+
+void
+foo ()
+{
+  for (static auto [ a ] : z)		// { dg-error "for-range-declaration cannot be 'static'" }
+    ;					// { dg-error "structured binding declaration can be 'static' only in" "" { target c++17_down } .-1 }
+					// { dg-error "structured bindings only available with" "" { target c++14_down } .-2 }
+  for (thread_local auto [ a ] : z)	// { dg-error "for-range-declaration cannot be 'thread_local'" }
+    ;					// { dg-error "structured binding declaration can be 'thread_local' only in" "" { target c++17_down } .-1 }
+					// { dg-error "structured bindings only available with" "" { target c++14_down } .-2 }
+  for (__thread auto [ a ] : z)		// { dg-error "for-range-declaration cannot be '__thread'" }
+    ;					// { dg-error "function-scope 'structured binding' implicitly auto and declared '__thread'" "" { target *-*-* } .-1 }
+					// { dg-error "structured binding declaration can be '__thread' only in" "" { target c++17_down } .-2 }
+					// { dg-error "structured bindings only available with" "" { target c++14_down } .-3 }
+  for (register auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'register'" }
+    ;					// { dg-error "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (extern auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'extern'" }
+    ;					// { dg-error "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (mutable auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'mutable'" }
+    ;					// { dg-error "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (virtual auto [ a ] : z)		// { dg-error "'virtual' outside class declaration" }
+    ;					// { dg-error "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (explicit auto [ a ] : z)		// { dg-error "'explicit' outside class declaration" }
+    ;					// { dg-error "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (friend auto [ a ] : z)		// { dg-error "'friend' used outside of class" }
+    ;					// { dg-error "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (typedef auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'typedef'" }
+    ;					// { dg-error "structured bindings only available with" "" { target c++14_down } .-1 }
+#if __cplusplus >= 202002L
+  for (consteval auto [ a ] : z)	// { dg-error "structured binding declaration cannot be 'consteval'" "" { target c++20 } }
+    ;
+  for (constinit auto [ a ] : z)	// { dg-error "'constinit' can only be applied to a variable with static or thread storage duration" "" { target c++20 } }
+    ;
+#endif
+  for (inline auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'inline'" }
+    ;					// { dg-error "structured bindings only available with" "" { target c++14_down } .-1 }
+}
diff --git a/gcc/testsuite/g++.dg/cpp0x/range-for43.C b/gcc/testsuite/g++.dg/cpp0x/range-for43.C
new file mode 100644
index 0000000..77060e3
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/range-for43.C
@@ -0,0 +1,42 @@
+// PR c++/84009
+// { dg-do compile { target c++11 } }
+// { dg-options "" }
+
+struct S { int y; } z[64];
+
+void
+foo ()
+{
+  for (static auto [ a ] : z)		// { dg-warning "for-range-declaration cannot be 'static'" }
+    ;					// { dg-warning "structured binding declaration can be 'static' only in" "" { target c++17_down } .-1 }
+					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-2 }
+  for (thread_local auto [ a ] : z)	// { dg-warning "for-range-declaration cannot be 'thread_local'" }
+    ;					// { dg-warning "structured binding declaration can be 'thread_local' only in" "" { target c++17_down } .-1 }
+					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-2 }
+  for (__thread auto [ a ] : z)		// { dg-warning "for-range-declaration cannot be '__thread'" }
+    ;					// { dg-warning "function-scope 'structured binding' implicitly auto and declared '__thread'" "" { target *-*-* } .-1 }
+ 					// { dg-warning "structured binding declaration can be '__thread' only in" "" { target c++17_down } .-2 }
+					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-3 }
+  for (register auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'register'" }
+    ;					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (extern auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'extern'" }
+    ;					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (mutable auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'mutable'" }
+    ;					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (virtual auto [ a ] : z)		// { dg-error "'virtual' outside class declaration" }
+    ;					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (explicit auto [ a ] : z)		// { dg-error "'explicit' outside class declaration" }
+    ;					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (friend auto [ a ] : z)		// { dg-error "'friend' used outside of class" }
+    ;					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-1 }
+  for (typedef auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'typedef'" }
+    ;					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-1 }
+#if __cplusplus >= 202002L
+  for (consteval auto [ a ] : z)	// { dg-error "structured binding declaration cannot be 'consteval'" "" { target c++20 } }
+    ;
+  for (constinit auto [ a ] : z)	// { dg-error "'constinit' can only be applied to a variable with static or thread storage duration" "" { target c++20 } }
+    ;
+#endif
+  for (inline auto [ a ] : z)		// { dg-error "structured binding declaration cannot be 'inline'" }
+    ;					// { dg-warning "structured bindings only available with" "" { target c++14_down } .-1 }
+}
diff --git a/gcc/testsuite/gcc.dg/crc-non-cst-poly-1.c b/gcc/testsuite/gcc.dg/crc-non-cst-poly-1.c
new file mode 100644
index 0000000..0c3d905
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/crc-non-cst-poly-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "" } */
+
+/* PR middle-end/120709 */
+/* Make sure we don't ICE on a non-constant poly argument. */
+
+
+typedef unsigned char uint8_t;
+uint8_t crc8_data8(uint8_t crc, uint8_t data, uint8_t polynomial) {
+  return __builtin_rev_crc32_data8 (crc, data, polynomial); /* { dg-error "must be a constant" } */
+}
diff --git a/gcc/testsuite/gcc.dg/torture/pr120951-1.c b/gcc/testsuite/gcc.dg/torture/pr120951-1.c
new file mode 100644
index 0000000..4e2b41d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr120951-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-fnon-call-exceptions -fsignaling-nans" } */
+
+/* PR tree-optimization/120951 */
+
+/* cdce would create a trapping comparison inside a condition.
+   tests to make sure that does not happen.  */
+
+double f(double r, double i) {
+   return __builtin_fmod(r, i);
+}
+
diff --git a/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c b/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c
index 43a9090..904dd0c 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c
@@ -31,4 +31,4 @@ int main()
 }
 /* We will have profiles for test2 and test2.constprop.0 that will have to be
    merged,  */
-/* { dg-final-use-autofdo { scan-ipa-dump "Merging duplicate symbol test2" "afdo_offline"} } */
+/* { dg-final-use-autofdo { scan-ipa-dump "Merging duplicate instance: test2" "afdo_offline"} } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr120817.c b/gcc/testsuite/gcc.dg/vect/pr120817.c
new file mode 100644
index 0000000..d8f55c9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr120817.c
@@ -0,0 +1,40 @@
+/* { dg-additional-options "-O1" } */
+/* { dg-additional-options "-mcpu=neoverse-n2" { target aarch64*-*-* } } */
+
+#include "tree-vect.h"
+
+typedef struct {
+    int _M_current;
+} __normal_iterator;
+
+typedef struct {
+    char _M_elems[5];
+} array_5;
+
+__normal_iterator __trans_tmp_1 = {-5};
+
+__attribute__((noipa))
+array_5 copySourceIntoTarget() {
+    array_5 target;
+    char* target_it = target._M_elems;
+
+    while (__trans_tmp_1._M_current != 0) {
+        *target_it = 1;
+        __trans_tmp_1._M_current++;
+        target_it++;
+    }
+
+    return target;
+}
+
+int main ()
+{
+  check_vect ();
+
+  array_5 res = copySourceIntoTarget();
+
+#pragma GCC novector
+  for (int i = 0; i < 5; i++)
+    if (res._M_elems[i] != 1)
+      __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c
new file mode 100644
index 0000000..bf9c127
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c
@@ -0,0 +1,602 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 --param=aarch64-autovec-preference=sve-only -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint.h>
+
+#define UNLT(A, B) (!__builtin_isgreaterequal (A, B))
+#define UNLE(A, B) (!__builtin_isgreater (A, B))
+#define UNGT(A, B) (!__builtin_islessequal (A, B))
+#define UNGE(A, B) (!__builtin_isless (A, B))
+#define UNEQ(A, B) (!__builtin_islessgreater (A, B))
+
+#define EQ(A, B) ((A) == (B))
+#define NE(A, B) ((A) != (B))
+#define LE(A, B) ((A) <= (B))
+#define LT(A, B) ((A) < (B))
+#define GE(A, B) ((A) >= (B))
+#define GT(A, B) ((A) > (B))
+#define ORDERED(A, B) (!__builtin_isunordered (A, B))
+#define UNORDERED(A, B) (__builtin_isunordered (A, B))
+
+#define b_i b[i]
+
+#define TEST_FCM(TYPE0, TYPE1, CMP, RHS, COUNT)			\
+  void								\
+  f_##TYPE0##_##TYPE1##_##CMP##_##RHS (TYPE0 *__restrict out,	\
+				       TYPE1 *__restrict a,	\
+				       TYPE1 *__restrict b)	\
+  {								\
+    for (unsigned int i = 0; i < COUNT; i++)			\
+      out[i] = CMP (a[i], RHS) ? 3 : out[i];			\
+  }
+
+#define TEST_CC_REG(CMP)		      \
+  TEST_FCM (uint64_t, float, CMP, b_i, 32)    \
+  TEST_FCM (uint32_t, _Float16, CMP, b_i, 64) \
+  TEST_FCM (uint64_t, _Float16, CMP, b_i, 32)
+
+#define TEST_CC_ALL(CMP)		    \
+  TEST_CC_REG (CMP)			    \
+  TEST_FCM (uint64_t, float, CMP, 0, 32)    \
+  TEST_FCM (uint32_t, _Float16, CMP, 0, 64) \
+  TEST_FCM (uint64_t, _Float16, CMP, 0, 32)
+
+
+/*
+** f_uint64_t_float_UNLT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmge	p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNLT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmge	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNLT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmge	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNLT)
+
+/*
+** f_uint64_t_float_UNLE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmgt	p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNLE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmgt	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNLE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmgt	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNLE)
+
+/*
+** f_uint64_t_float_UNGT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmle	p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNGT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmle	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNGT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmle	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNGT)
+
+/*
+** f_uint64_t_float_UNGE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmlt	p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNGE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmlt	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNGE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmlt	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNGE)
+
+/*
+** f_uint64_t_float_UNEQ_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmne	p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNEQ_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmne	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNEQ_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	(p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**	not	(p[0-9]+)\.b, \1/z, \2\.b
+**	fcmne	p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNEQ)
+
+/*
+** f_uint64_t_float_EQ_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmeq	p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_EQ_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmeq	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_EQ_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmeq	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_EQ_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmeq	p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_EQ_0:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmeq	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_EQ_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmeq	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (EQ)
+
+/*
+** f_uint64_t_float_NE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmne	p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_NE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmne	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_NE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmne	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_NE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmne	p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_NE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmne	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_NE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmne	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (NE)
+
+/*
+** f_uint64_t_float_LE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmle	p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_LE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmle	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_LE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmle	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_LE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmle	p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_LE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmle	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_LE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmle	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (LE)
+
+/*
+** f_uint64_t_float_LT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmlt	p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_LT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmlt	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_LT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmlt	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_LT_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmlt	p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_LT_0:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmlt	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_LT_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmlt	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (LT)
+
+/*
+** f_uint64_t_float_GE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmge	p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_GE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmge	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_GE_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmge	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_GE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmge	p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_GE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmge	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_GE_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmge	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (GE)
+
+/*
+** f_uint64_t_float_GT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmgt	p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_GT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmgt	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_GT_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmgt	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t_float_GT_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmgt	p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_GT_0:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmgt	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_GT_0:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmgt	p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0
+**  ...
+*/
+TEST_CC_ALL (GT)
+
+/*
+** f_uint64_t_float_ORDERED_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_ORDERED_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmuo	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_ORDERED_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (ORDERED)
+
+/*
+** f_uint64_t_float_UNORDERED_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s
+**  ...
+*/
+
+/*
+** f_uint32_t__Float16_UNORDERED_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.s, all
+**  ...
+**	fcmuo	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+
+/*
+** f_uint64_t__Float16_UNORDERED_b_i:
+**  ...
+**	ptrue	(p[0-9]+)\.d, all
+**  ...
+**	fcmuo	p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h
+**  ...
+*/
+TEST_CC_REG (UNORDERED)
+
+
+/* { dg-final { check-function-bodies "**" "" ""} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c
new file mode 100644
index 0000000..ab210da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 --param=aarch64-autovec-preference=sve-only -fno-trapping-math" } */
+
+#include "unpacked_fcm_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 32 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 32 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 32 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
diff --git a/gcc/testsuite/gcc.target/i386/auto-init-padding-9.c b/gcc/testsuite/gcc.target/i386/auto-init-padding-9.c
index 102217c..4f26aa4 100644
--- a/gcc/testsuite/gcc.target/i386/auto-init-padding-9.c
+++ b/gcc/testsuite/gcc.target/i386/auto-init-padding-9.c
@@ -8,17 +8,28 @@
 /*
 **foo:
 **...
+**	leaq	-160\(%rbp\), %rax
+**	movq	%rax, %rcx
 **	pxor	%xmm0, %xmm0
-**...
+**	movl	\$160, %edx
+**	movl	%edx, %edi
+**	andl	\$-64, %edi
+**	movl	\$0, %esi
 **.L[0-9]+:
-**	movl	%esi, %ecx
-**	movaps	%xmm0, \(%rdx,%rcx\)
-**	movaps	%xmm0, 16\(%rdx,%rcx\)
-**	movaps	%xmm0, 32\(%rdx,%rcx\)
-**	movaps	%xmm0, 48\(%rdx,%rcx\)
+**	movl	%esi, %edx
+**	movaps	%xmm0, \(%rax,%rdx\)
+**	movaps	%xmm0, 16\(%rax,%rdx\)
+**	movaps	%xmm0, 32\(%rax,%rdx\)
+**	movaps	%xmm0, 48\(%rax,%rdx\)
 **	addl	\$64, %esi
 **	cmpl	%edi, %esi
 **	jb	.L[0-9]+
+**	movl	%esi, %eax
+**	addq	%rax, %rcx
+**	movaps	%xmm0, \(%rcx\)
+**	movaps	%xmm0, 16\(%rcx\)
+**	movzbl	-116\(%rbp\), %eax
+**	movsbl	%al, %eax
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c
new file mode 100644
index 0000000..753238e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movq	221\(%rsi\), %rax
+**	xorl	%edx, %edx
+**	movq	%rax, 221\(%rdi\)
+**	movq	229\(%rsi\), %rax
+**	movq	%rax, 229\(%rdi\)
+**	movq	237\(%rsi\), %rax
+**	movq	%rax, 237\(%rdi\)
+**	movq	245\(%rsi\), %rax
+**	movq	%rax, 245\(%rdi\)
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$32, %edx
+**	movq	\(%rsi,%rax\), %r10
+**	movq	8\(%rsi,%rax\), %r9
+**	movq	16\(%rsi,%rax\), %r8
+**	movq	24\(%rsi,%rax\), %rcx
+**	movq	%r10, \(%rdi,%rax\)
+**	movq	%r9, 8\(%rdi,%rax\)
+**	movq	%r8, 16\(%rdi,%rax\)
+**	movq	%rcx, 24\(%rdi,%rax\)
+**	cmpl	\$224, %edx
+**	jb	.L[0-9]+
+**	ret
+**...
+*/
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 253);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c
new file mode 100644
index 0000000..9b0fb06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$64, %edx
+**	movdqa	src\(%rax\), %xmm3
+**	movdqa	src\+16\(%rax\), %xmm2
+**	movdqa	src\+32\(%rax\), %xmm1
+**	movdqa	src\+48\(%rax\), %xmm0
+**	movaps	%xmm3, dest\(%rax\)
+**	movaps	%xmm2, dest\+16\(%rax\)
+**	movaps	%xmm1, dest\+32\(%rax\)
+**	movaps	%xmm0, dest\+48\(%rax\)
+**	cmpl	\$256, %edx
+**	jb	.L[0-9]+
+**	movdqa	src\(%rdx\), %xmm0
+**	movaps	%xmm0, dest\(%rdx\)
+**	ret
+**...
+*/
+
+#define SIZE (16 + 1) * 16
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c
new file mode 100644
index 0000000..600459b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$64, %edx
+**	movdqa	src\(%rax\), %xmm3
+**	movdqa	src\+16\(%rax\), %xmm2
+**	movdqa	src\+32\(%rax\), %xmm1
+**	movdqa	src\+48\(%rax\), %xmm0
+**	movaps	%xmm3, dest\(%rax\)
+**	movaps	%xmm2, dest\+16\(%rax\)
+**	movaps	%xmm1, dest\+32\(%rax\)
+**	movaps	%xmm0, dest\+48\(%rax\)
+**	cmpl	\$256, %edx
+**	jb	.L[0-9]+
+**	movdqa	src\(%rdx\), %xmm0
+**	movaps	%xmm0, dest\(%rdx\)
+**	movdqu	src\+15\(%rdx\), %xmm0
+**	movups	%xmm0, dest\+15\(%rdx\)
+**	ret
+**...
+*/
+
+#define SIZE 16 * 16 + 31
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c
new file mode 100644
index 0000000..14833ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	subl	\$-128, %edx
+**	vmovdqa	src\(%rax\), %ymm3
+**	vmovdqa	src\+32\(%rax\), %ymm2
+**	vmovdqa	src\+64\(%rax\), %ymm1
+**	vmovdqa	src\+96\(%rax\), %ymm0
+**	vmovdqa	%ymm3, dest\(%rax\)
+**	vmovdqa	%ymm2, dest\+32\(%rax\)
+**	vmovdqa	%ymm1, dest\+64\(%rax\)
+**	vmovdqa	%ymm0, dest\+96\(%rax\)
+**	cmpl	\$512, %edx
+**	jb	.L[0-9]+
+**	vmovdqa	src\(%rdx\), %ymm0
+**	vmovdqa	%ymm0, dest\(%rdx\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+#define SIZE (16 + 1) * 32
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c
new file mode 100644
index 0000000..15ffed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	subl	\$-128, %edx
+**	vmovdqa	src\(%rax\), %ymm3
+**	vmovdqa	src\+32\(%rax\), %ymm2
+**	vmovdqa	src\+64\(%rax\), %ymm1
+**	vmovdqa	src\+96\(%rax\), %ymm0
+**	vmovdqa	%ymm3, dest\(%rax\)
+**	vmovdqa	%ymm2, dest\+32\(%rax\)
+**	vmovdqa	%ymm1, dest\+64\(%rax\)
+**	vmovdqa	%ymm0, dest\+96\(%rax\)
+**	cmpl	\$512, %edx
+**	jb	.L[0-9]+
+**	vmovdqa	src\(%rdx\), %ymm0
+**	vmovdqa	%ymm0, dest\(%rdx\)
+**	vmovdqu	src\+31\(%rdx\), %ymm0
+**	vmovdqu	%ymm0, dest\+31\(%rdx\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+#define SIZE 16 * 32 + 32 + 31
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c
new file mode 100644
index 0000000..d57dcc1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$256, %edx
+**	vmovdqa64	src\(%rax\), %zmm3
+**	vmovdqa64	src\+64\(%rax\), %zmm2
+**	vmovdqa64	src\+128\(%rax\), %zmm1
+**	vmovdqa64	src\+192\(%rax\), %zmm0
+**	vmovdqa64	%zmm3, dest\(%rax\)
+**	vmovdqa64	%zmm2, dest\+64\(%rax\)
+**	vmovdqa64	%zmm1, dest\+128\(%rax\)
+**	vmovdqa64	%zmm0, dest\+192\(%rax\)
+**	cmpl	\$1024, %edx
+**	jb	.L[0-9]+
+**	vmovdqa64	src\(%rdx\), %zmm0
+**	vmovdqa64	%zmm0, dest\(%rdx\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+#define SIZE (16 + 1) * 64
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c
new file mode 100644
index 0000000..d9eb77d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$256, %edx
+**	vmovdqa64	src\(%rax\), %zmm3
+**	vmovdqa64	src\+64\(%rax\), %zmm2
+**	vmovdqa64	src\+128\(%rax\), %zmm1
+**	vmovdqa64	src\+192\(%rax\), %zmm0
+**	vmovdqa64	%zmm3, dest\(%rax\)
+**	vmovdqa64	%zmm2, dest\+64\(%rax\)
+**	vmovdqa64	%zmm1, dest\+128\(%rax\)
+**	vmovdqa64	%zmm0, dest\+192\(%rax\)
+**	cmpl	\$1024, %edx
+**	jb	.L[0-9]+
+**	vmovdqa	src\(%rdx\), %ymm0
+**	vmovdqa	%ymm0, dest\(%rdx\)
+**	vmovdqu	src\+31\(%rdx\), %ymm0
+**	vmovdqu	%ymm0, dest\+31\(%rdx\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+#define SIZE 16 * 64 + 63
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
index d0316ef..4716086 100644
--- a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
@@ -6,9 +6,16 @@
 /*
 **foo:
 **.LFB[0-9]+:
-**...
+**	.cfi_startproc
+**	movq	221\(%rsi\), %rax
 **	xorl	%edx, %edx
-**...
+**	movq	%rax, 221\(%rdi\)
+**	movq	229\(%rsi\), %rax
+**	movq	%rax, 229\(%rdi\)
+**	movq	237\(%rsi\), %rax
+**	movq	%rax, 237\(%rdi\)
+**	movq	245\(%rsi\), %rax
+**	movq	%rax, 245\(%rdi\)
 **.L[0-9]+:
 **	movl	%edx, %eax
 **	addl	\$32, %edx
@@ -22,6 +29,7 @@
 **	movq	%rcx, 24\(%rdi,%rax\)
 **	cmpl	\$224, %edx
 **	jb	.L[0-9]+
+**	ret
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c
new file mode 100644
index 0000000..90e544d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**	movups	%xmm0, 190\(%rdi\)
+**	movups	%xmm0, 206\(%rdi\)
+**	movups	%xmm0, 222\(%rdi\)
+**	movups	%xmm0, 238\(%rdi\)
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movups	%xmm0, \(%rdi,%rdx\)
+**	movups	%xmm0, 16\(%rdi,%rdx\)
+**	movups	%xmm0, 32\(%rdi,%rdx\)
+**	movups	%xmm0, 48\(%rdi,%rdx\)
+**	cmpl	\$192, %eax
+**	jb	.L[0-9]+
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c
new file mode 100644
index 0000000..6d3d9e7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movq	\$0, 48\(%rdi\)
+**	movq	\$0, \(%rdi\)
+**	movq	\$0, 8\(%rdi\)
+**	movq	\$0, 16\(%rdi\)
+**	movq	\$0, 24\(%rdi\)
+**	movq	\$0, 32\(%rdi\)
+**	movq	\$0, 40\(%rdi\)
+**	movq	\$0, 53\(%rdi\)
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c
new file mode 100644
index 0000000..30b0cad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movabsq	\$289360691352306692, %rax
+**	movq	%rax, 48\(%rdi\)
+**	movq	%rax, \(%rdi\)
+**	movq	%rax, 8\(%rdi\)
+**	movq	%rax, 16\(%rdi\)
+**	movq	%rax, 24\(%rdi\)
+**	movq	%rax, 32\(%rdi\)
+**	movq	%rax, 40\(%rdi\)
+**	movq	%rax, 53\(%rdi\)
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 4, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c
new file mode 100644
index 0000000..15987a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movabsq	\$72340172838076673, %rax
+**	movzbl	%sil, %esi
+**	imulq	%rax, %rsi
+**	movq	%rsi, 48\(%rdi\)
+**	movq	%rsi, \(%rdi\)
+**	movq	%rsi, 8\(%rdi\)
+**	movq	%rsi, 16\(%rdi\)
+**	movq	%rsi, 24\(%rdi\)
+**	movq	%rsi, 32\(%rdi\)
+**	movq	%rsi, 40\(%rdi\)
+**	movq	%rsi, 53\(%rdi\)
+**	ret
+**...
+*/
+
+void
+foo (char *dest, int c)
+{
+  __builtin_memset (dest, c, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c
new file mode 100644
index 0000000..3da6ca7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movaps	%xmm0, dest\(%rdx\)
+**	movaps	%xmm0, dest\+16\(%rdx\)
+**	movaps	%xmm0, dest\+32\(%rdx\)
+**	movaps	%xmm0, dest\+48\(%rdx\)
+**	cmpl	\$192, %eax
+**	jb	.L[0-9]+
+**	movaps	%xmm0, dest\(%rax\)
+**	movaps	%xmm0, dest\+16\(%rax\)
+**	movaps	%xmm0, dest\+32\(%rax\)
+**	ret
+**...
+*/
+
+char dest[240];
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c
new file mode 100644
index 0000000..7ec9b3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c
@@ -0,0 +1,91 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	cmpq	\$64, %rsi
+**	jnb	.L2
+**	testb	\$32, %sil
+**	jne	.L19
+**	testb	\$16, %sil
+**	jne	.L20
+**	testb	\$8, %sil
+**	jne	.L21
+**	testb	\$4, %sil
+**	jne	.L22
+**	testq	%rsi, %rsi
+**	jne	.L23
+**.L1:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	movups	%xmm0, -64\(%rdi,%rsi\)
+**	movups	%xmm0, -48\(%rdi,%rsi\)
+**	movups	%xmm0, -32\(%rdi,%rsi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$64, %rsi
+**	jb	.L1
+**	andq	\$-64, %rsi
+**	xorl	%eax, %eax
+**.L9:
+**	movups	%xmm0, \(%rdi,%rax\)
+**	movups	%xmm0, 16\(%rdi,%rax\)
+**	movups	%xmm0, 32\(%rdi,%rax\)
+**	movups	%xmm0, 48\(%rdi,%rax\)
+**	addq	\$64, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L9
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	movb	\$0, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L1
+**	xorl	%eax, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L19:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, 16\(%rdi\)
+**	movups	%xmm0, -32\(%rdi,%rsi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L20:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	movq	\$0, \(%rdi\)
+**	movq	\$0, -8\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	movl	\$0, \(%rdi\)
+**	movl	\$0, -4\(%rdi,%rsi\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c
new file mode 100644
index 0000000..e754405
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	vpxor	%xmm0, %xmm0, %xmm0
+**	cmpq	\$128, %rsi
+**	jnb	.L2
+**	testb	\$64, %sil
+**	jne	.L22
+**	testb	\$32, %sil
+**	jne	.L23
+**	testb	\$16, %sil
+**	jne	.L24
+**	testb	\$8, %sil
+**	jne	.L25
+**	testb	\$4, %sil
+**	jne	.L26
+**	testq	%rsi, %rsi
+**	jne	.L27
+**.L20:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu	%ymm0, -128\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -96\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$128, %rsi
+**	jb	.L19
+**	andq	\$-128, %rsi
+**	xorl	%eax, %eax
+**.L10:
+**	vmovdqu	%ymm0, \(%rdi,%rax\)
+**	vmovdqu	%ymm0, 32\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 64\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 96\(%rdi,%rax\)
+**	subq	\$-128, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L10
+**.L19:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L27:
+**	movb	\$0, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L20
+**	xorl	%eax, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, 32\(%rdi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	movq	\$0, \(%rdi\)
+**	movq	\$0, -8\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movl	\$0, \(%rdi\)
+**	movl	\$0, -4\(%rdi,%rsi\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c
new file mode 100644
index 0000000..c519bf3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c
@@ -0,0 +1,112 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	vpxor	%xmm0, %xmm0, %xmm0
+**	cmpq	\$256, %rsi
+**	jnb	.L2
+**	testb	\$-128, %sil
+**	jne	.L23
+**	testb	\$64, %sil
+**	jne	.L24
+**	testb	\$32, %sil
+**	jne	.L25
+**	testb	\$16, %sil
+**	jne	.L26
+**	testb	\$8, %sil
+**	jne	.L27
+**	testb	\$4, %sil
+**	jne	.L28
+**	testq	%rsi, %rsi
+**	jne	.L29
+**.L21:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu64	%zmm0, -256\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -192\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$256, %rsi
+**	jb	.L20
+**	xorb	%sil, %sil
+**	xorl	%eax, %eax
+**.L11:
+**	vmovdqu64	%zmm0, \(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 64\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 128\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 192\(%rdi,%rax\)
+**	addq	\$256, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L11
+**.L20:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L29:
+**	movb	\$0, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L21
+**	xorl	%eax, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, 64\(%rdi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L27:
+**	movq	\$0, \(%rdi\)
+**	movq	\$0, -8\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L28:
+**	movl	\$0, \(%rdi\)
+**	movl	\$0, -4\(%rdi,%rsi\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c
new file mode 100644
index 0000000..744184c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movaps	%xmm0, dest\(%rdx\)
+**	movaps	%xmm0, dest\+16\(%rdx\)
+**	movaps	%xmm0, dest\+32\(%rdx\)
+**	movaps	%xmm0, dest\+48\(%rdx\)
+**	cmpl	\$128, %eax
+**	jb	.L[0-9]+
+**	movq	\$0, dest\+48\(%rax\)
+**	movaps	%xmm0, dest\(%rax\)
+**	movaps	%xmm0, dest\+16\(%rax\)
+**	movaps	%xmm0, dest\+32\(%rax\)
+**	ret
+**...
+*/
+
+char dest[184];
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-18.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-18.c
new file mode 100644
index 0000000..32f8981
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-18.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movaps	%xmm0, dest\(%rdx\)
+**	movaps	%xmm0, dest\+16\(%rdx\)
+**	movaps	%xmm0, dest\+32\(%rdx\)
+**	movaps	%xmm0, dest\+48\(%rdx\)
+**	cmpl	\$128, %eax
+**	jb	.L[0-9]+
+**	movaps	%xmm0, dest\+32\(%rax\)
+**	movaps	%xmm0, dest\(%rax\)
+**	movl	\$0, dest\+47\(%rax\)
+**	movaps	%xmm0, dest\+16\(%rax\)
+**	ret
+**...
+*/
+
+char dest[179];
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-19.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-19.c
new file mode 100644
index 0000000..04f9171
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-19.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movaps	%xmm0, dest\(%rdx\)
+**	movaps	%xmm0, dest\+16\(%rdx\)
+**	movaps	%xmm0, dest\+32\(%rdx\)
+**	movaps	%xmm0, dest\+48\(%rdx\)
+**	cmpl	\$128, %eax
+**	jb	.L[0-9]+
+**	movb	\$0, dest\+48\(%rax\)
+**	movaps	%xmm0, dest\(%rax\)
+**	movaps	%xmm0, dest\+16\(%rax\)
+**	movaps	%xmm0, dest\+32\(%rax\)
+**	ret
+**...
+*/
+
+char dest[177];
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c
new file mode 100644
index 0000000..f7834c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	vpxor	%xmm0, %xmm0, %xmm0
+**	vmovdqu	%ymm0, 192\(%rdi\)
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, 32\(%rdi\)
+**	vmovdqu	%ymm0, 64\(%rdi\)
+**	vmovdqu	%ymm0, 96\(%rdi\)
+**	vmovdqu	%ymm0, 128\(%rdi\)
+**	vmovdqu	%ymm0, 160\(%rdi\)
+**	vmovdqu	%ymm0, 222\(%rdi\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-20.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-20.c
new file mode 100644
index 0000000..edece12
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-20.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movd	%edi, %xmm0
+**	punpcklbw	%xmm0, %xmm0
+**	punpcklwd	%xmm0, %xmm0
+**	pshufd	\$0, %xmm0, %xmm0
+**	movaps	%xmm0, dest\+160\(%rip\)
+**	movaps	%xmm0, dest\(%rip\)
+**	movaps	%xmm0, dest\+16\(%rip\)
+**	movaps	%xmm0, dest\+32\(%rip\)
+**	movaps	%xmm0, dest\+48\(%rip\)
+**	movaps	%xmm0, dest\+64\(%rip\)
+**	movaps	%xmm0, dest\+80\(%rip\)
+**	movaps	%xmm0, dest\+96\(%rip\)
+**	movaps	%xmm0, dest\+112\(%rip\)
+**	movaps	%xmm0, dest\+128\(%rip\)
+**	movaps	%xmm0, dest\+144\(%rip\)
+**	movd	%xmm0, dest\+175\(%rip\)
+**	ret
+**...
+*/
+
+char dest[179];
+
+void
+foo (int c)
+{
+  __builtin_memset (dest, c, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-21.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-21.c
new file mode 100644
index 0000000..a88e109
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-21.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movd	%edi, %xmm0
+**	movb	%dil, dest\+176\(%rip\)
+**	punpcklbw	%xmm0, %xmm0
+**	punpcklwd	%xmm0, %xmm0
+**	pshufd	\$0, %xmm0, %xmm0
+**	movaps	%xmm0, dest\(%rip\)
+**	movaps	%xmm0, dest\+16\(%rip\)
+**	movaps	%xmm0, dest\+32\(%rip\)
+**	movaps	%xmm0, dest\+48\(%rip\)
+**	movaps	%xmm0, dest\+64\(%rip\)
+**	movaps	%xmm0, dest\+80\(%rip\)
+**	movaps	%xmm0, dest\+96\(%rip\)
+**	movaps	%xmm0, dest\+112\(%rip\)
+**	movaps	%xmm0, dest\+128\(%rip\)
+**	movaps	%xmm0, dest\+144\(%rip\)
+**	movaps	%xmm0, dest\+160\(%rip\)
+**	ret
+**...
+*/
+
+char dest[177];
+
+void
+foo (int c)
+{
+  __builtin_memset (dest, c, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-22.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-22.c
new file mode 100644
index 0000000..f2bd698
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-22.c
@@ -0,0 +1,27 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=rep_8byte:8192:align,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movl	\$25, %ecx
+**	xorl	%eax, %eax
+**	movl	\$dest, %edi
+**	rep stosq
+**	movl	\$0, \(%rdi\)
+**	ret
+**...
+*/
+
+#define SIZE 204
+
+char dest[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-23.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-23.c
new file mode 100644
index 0000000..784f8dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-23.c
@@ -0,0 +1,67 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movzbl	%dil, %edi
+**	movl	\$p, %eax
+**	movabsq	\$72340172838076673, %rdx
+**	imulq	%rdx, %rdi
+**	movq	%rdi, %xmm0
+**	punpcklqdq	%xmm0, %xmm0
+**	cmpq	\$64, %rsi
+**	jnb	.L18
+**.L2:
+**	movq	%rsi, %rcx
+**	andl	\$63, %ecx
+**	je	.L1
+**	xorl	%edx, %edx
+**	andl	\$1, %esi
+**	je	.L5
+**	movl	\$1, %edx
+**	movb	%dil, \(%rax\)
+**	cmpq	%rcx, %rdx
+**	jnb	.L19
+**.L5:
+**	movb	%dil, \(%rax,%rdx\)
+**	movb	%dil, 1\(%rax,%rdx\)
+**	addq	\$2, %rdx
+**	cmpq	%rcx, %rdx
+**	jb	.L5
+**.L1:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L18:
+**	movq	%rsi, %rdx
+**	xorl	%eax, %eax
+**	andq	\$-64, %rdx
+**.L3:
+**	movaps	%xmm0, p\(%rax\)
+**	addq	\$64, %rax
+**	movaps	%xmm0, p-48\(%rax\)
+**	movaps	%xmm0, p-32\(%rax\)
+**	movaps	%xmm0, p-16\(%rax\)
+**	cmpq	%rdx, %rax
+**	jb	.L3
+**	addq	\$p, %rax
+**	jmp	.L2
+**.L19:
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+
+#define WRITE_CHUNK 256
+char p[WRITE_CHUNK];
+
+void
+foo (int c, __SIZE_TYPE__ nbyte)
+{
+ __builtin_memset (p, c, nbyte);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c
new file mode 100644
index 0000000..621baf7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	vpxor	%xmm0, %xmm0, %xmm0
+**	vmovdqu8	%zmm0, 128\(%rdi\)
+**	vmovdqu8	%zmm0, \(%rdi\)
+**	vmovdqu8	%zmm0, 64\(%rdi\)
+**	vmovdqu8	%zmm0, 190\(%rdi\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c
new file mode 100644
index 0000000..712404b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c
@@ -0,0 +1,93 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$289360691352306692, %rax
+**	movq	%rax, %xmm0
+**	punpcklqdq	%xmm0, %xmm0
+**	cmpq	\$64, %rsi
+**	jnb	.L2
+**	testb	\$32, %sil
+**	jne	.L19
+**	testb	\$16, %sil
+**	jne	.L20
+**	testb	\$8, %sil
+**	jne	.L21
+**	testb	\$4, %sil
+**	jne	.L22
+**	testq	%rsi, %rsi
+**	jne	.L23
+**.L1:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	movups	%xmm0, -64\(%rdi,%rsi\)
+**	movups	%xmm0, -48\(%rdi,%rsi\)
+**	movups	%xmm0, -32\(%rdi,%rsi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$64, %rsi
+**	jb	.L1
+**	andq	\$-64, %rsi
+**	xorl	%eax, %eax
+**.L9:
+**	movups	%xmm0, \(%rdi,%rax\)
+**	movups	%xmm0, 16\(%rdi,%rax\)
+**	movups	%xmm0, 32\(%rdi,%rax\)
+**	movups	%xmm0, 48\(%rdi,%rax\)
+**	addq	\$64, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L9
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	movb	\$4, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L1
+**	movl	\$1028, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L19:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, 16\(%rdi\)
+**	movups	%xmm0, -32\(%rdi,%rsi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L20:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	movq	%rax, \(%rdi\)
+**	movq	%rax, -8\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	movl	\$67372036, \(%rdi\)
+**	movl	\$67372036, -4\(%rdi,%rsi\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c
new file mode 100644
index 0000000..f597395
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c
@@ -0,0 +1,102 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$289360691352306692, %rax
+**	vmovq	%rax, %xmm1
+**	vpbroadcastq	%xmm1, %ymm0
+**	cmpq	\$128, %rsi
+**	jnb	.L2
+**	testb	\$64, %sil
+**	jne	.L21
+**	testb	\$32, %sil
+**	jne	.L22
+**	testb	\$16, %sil
+**	jne	.L23
+**	testb	\$8, %sil
+**	jne	.L24
+**	testb	\$4, %sil
+**	jne	.L25
+**	testq	%rsi, %rsi
+**	jne	.L26
+**.L19:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu	%ymm0, -128\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -96\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$128, %rsi
+**	jb	.L19
+**	andq	\$-128, %rsi
+**	xorl	%eax, %eax
+**.L10:
+**	vmovdqu	%ymm0, \(%rdi,%rax\)
+**	vmovdqu	%ymm0, 32\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 64\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 96\(%rdi,%rax\)
+**	subq	\$-128, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L10
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movb	\$4, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L19
+**	movl	\$1028, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, 32\(%rdi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	movq	%rax, \(%rdi\)
+**	movq	%rax, -8\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	movl	\$67372036, \(%rdi\)
+**	movl	\$67372036, -4\(%rdi,%rsi\)
+**	jmp	.L19
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c
new file mode 100644
index 0000000..7ba1b742
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c
@@ -0,0 +1,109 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$289360691352306692, %rax
+**	vpbroadcastq	%rax, %zmm0
+**	cmpq	\$256, %rsi
+**	jnb	.L2
+**	testb	\$-128, %sil
+**	jne	.L22
+**	testb	\$64, %sil
+**	jne	.L23
+**	testb	\$32, %sil
+**	jne	.L24
+**	testb	\$16, %sil
+**	jne	.L25
+**	testb	\$8, %sil
+**	jne	.L26
+**	testb	\$4, %sil
+**	jne	.L27
+**	testq	%rsi, %rsi
+**	jne	.L28
+**.L20:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu64	%zmm0, -256\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -192\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$256, %rsi
+**	jb	.L20
+**	xorb	%sil, %sil
+**	xorl	%eax, %eax
+**.L11:
+**	vmovdqu64	%zmm0, \(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 64\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 128\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 192\(%rdi,%rax\)
+**	addq	\$256, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L11
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L28:
+**	movb	\$4, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L20
+**	movl	\$1028, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, 64\(%rdi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movq	%rax, \(%rdi\)
+**	movq	%rax, -8\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L27:
+**	movl	\$67372036, \(%rdi\)
+**	movl	\$67372036, -4\(%rdi,%rsi\)
+**	jmp	.L20
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c
new file mode 100644
index 0000000..62f61c5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$72340172838076673, %rax
+**	movzbl	%sil, %esi
+**	imulq	%rax, %rsi
+**	movq	%rsi, %xmm0
+**	punpcklqdq	%xmm0, %xmm0
+**	cmpq	\$64, %rdx
+**	jnb	.L2
+**	testb	\$32, %dl
+**	jne	.L19
+**	testb	\$16, %dl
+**	jne	.L20
+**	testb	\$8, %dl
+**	jne	.L21
+**	testb	\$4, %dl
+**	jne	.L22
+**	testq	%rdx, %rdx
+**	jne	.L23
+**.L1:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	movups	%xmm0, -64\(%rdi,%rdx\)
+**	movups	%xmm0, -48\(%rdi,%rdx\)
+**	movups	%xmm0, -32\(%rdi,%rdx\)
+**	movups	%xmm0, -16\(%rdi,%rdx\)
+**	subq	\$1, %rdx
+**	cmpq	\$64, %rdx
+**	jb	.L1
+**	andq	\$-64, %rdx
+**	xorl	%eax, %eax
+**.L9:
+**	movups	%xmm0, \(%rdi,%rax\)
+**	movups	%xmm0, 16\(%rdi,%rax\)
+**	movups	%xmm0, 32\(%rdi,%rax\)
+**	movups	%xmm0, 48\(%rdi,%rax\)
+**	addq	\$64, %rax
+**	cmpq	%rdx, %rax
+**	jb	.L9
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	movb	%sil, \(%rdi\)
+**	testb	\$2, %dl
+**	je	.L1
+**	movw	%si, -2\(%rdi,%rdx\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L19:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, 16\(%rdi\)
+**	movups	%xmm0, -32\(%rdi,%rdx\)
+**	movups	%xmm0, -16\(%rdi,%rdx\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L20:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, -16\(%rdi,%rdx\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	movq	%rsi, \(%rdi\)
+**	movq	%rsi, -8\(%rdi,%rdx\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	movl	%esi, \(%rdi\)
+**	movl	%esi, -4\(%rdi,%rdx\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c
new file mode 100644
index 0000000..d12ab15
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$72340172838076673, %rax
+**	movzbl	%sil, %esi
+**	imulq	%rax, %rsi
+**	vmovq	%rsi, %xmm1
+**	vpbroadcastq	%xmm1, %ymm0
+**	cmpq	\$128, %rdx
+**	jnb	.L2
+**	testb	\$64, %dl
+**	jne	.L21
+**	testb	\$32, %dl
+**	jne	.L22
+**	testb	\$16, %dl
+**	jne	.L23
+**	testb	\$8, %dl
+**	jne	.L24
+**	testb	\$4, %dl
+**	jne	.L25
+**	testq	%rdx, %rdx
+**	jne	.L26
+**.L19:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu	%ymm0, -128\(%rdi,%rdx\)
+**	vmovdqu	%ymm0, -96\(%rdi,%rdx\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rdx\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rdx\)
+**	subq	\$1, %rdx
+**	cmpq	\$128, %rdx
+**	jb	.L19
+**	andq	\$-128, %rdx
+**	xorl	%eax, %eax
+**.L10:
+**	vmovdqu	%ymm0, \(%rdi,%rax\)
+**	vmovdqu	%ymm0, 32\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 64\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 96\(%rdi,%rax\)
+**	subq	\$-128, %rax
+**	cmpq	%rdx, %rax
+**	jb	.L10
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movb	%sil, \(%rdi\)
+**	testb	\$2, %dl
+**	je	.L19
+**	movw	%si, -2\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, 32\(%rdi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rdx\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	movq	%rsi, \(%rdi\)
+**	movq	%rsi, -8\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	movl	%esi, \(%rdi\)
+**	movl	%esi, -4\(%rdi,%rdx\)
+**	jmp	.L19
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c
new file mode 100644
index 0000000..1a0abe6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c
@@ -0,0 +1,110 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$72340172838076673, %rax
+**	movzbl	%sil, %esi
+**	imulq	%rax, %rsi
+**	vpbroadcastq	%rsi, %zmm0
+**	cmpq	\$256, %rdx
+**	jnb	.L2
+**	testb	\$-128, %dl
+**	jne	.L22
+**	testb	\$64, %dl
+**	jne	.L23
+**	testb	\$32, %dl
+**	jne	.L24
+**	testb	\$16, %dl
+**	jne	.L25
+**	testb	\$8, %dl
+**	jne	.L26
+**	testb	\$4, %dl
+**	jne	.L27
+**	testq	%rdx, %rdx
+**	jne	.L28
+**.L20:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu64	%zmm0, -256\(%rdi,%rdx\)
+**	vmovdqu64	%zmm0, -192\(%rdi,%rdx\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rdx\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rdx\)
+**	subq	\$1, %rdx
+**	cmpq	\$256, %rdx
+**	jb	.L20
+**	xorb	%dl, %dl
+**	xorl	%eax, %eax
+**.L11:
+**	vmovdqu64	%zmm0, \(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 64\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 128\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 192\(%rdi,%rax\)
+**	addq	\$256, %rax
+**	cmpq	%rdx, %rax
+**	jb	.L11
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L28:
+**	movb	%sil, \(%rdi\)
+**	testb	\$2, %dl
+**	je	.L20
+**	movw	%si, -2\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, 64\(%rdi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rdx\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movq	%rsi, \(%rdi\)
+**	movq	%rsi, -8\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L27:
+**	movl	%esi, \(%rdi\)
+**	movl	%esi, -4\(%rdi,%rdx\)
+**	jmp	.L20
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-25.c b/gcc/testsuite/gcc.target/i386/memset-strategy-25.c
index 1cc3de7..7bd5d43 100644
--- a/gcc/testsuite/gcc.target/i386/memset-strategy-25.c
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-25.c
@@ -7,7 +7,11 @@
 **foo:
 **.LFB[0-9]+:
 **	.cfi_startproc
+**	movq	\$0, 221\(%rdi\)
 **	xorl	%eax, %eax
+**	movq	\$0, 229\(%rdi\)
+**	movq	\$0, 237\(%rdi\)
+**	movq	\$0, 245\(%rdi\)
 **.L[0-9]+:
 **	movl	%eax, %edx
 **	addl	\$32, %eax
@@ -17,6 +21,7 @@
 **	movq	\$0, 24\(%rdi,%rdx\)
 **	cmpl	\$224, %eax
 **	jb	.L[0-9]+
+**	ret
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-29.c b/gcc/testsuite/gcc.target/i386/memset-strategy-29.c
index 61aef92..a33bf92 100644
--- a/gcc/testsuite/gcc.target/i386/memset-strategy-29.c
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-29.c
@@ -8,7 +8,11 @@
 **...
 **.LFB[0-9]+:
 **	.cfi_startproc
+**	movq	\$0, 49\(%rdi\)
 **	xorl	%eax, %eax
+**	movq	\$0, 57\(%rdi\)
+**	movq	\$0, 65\(%rdi\)
+**	movq	\$0, 73\(%rdi\)
 **.L[0-9]+:
 **	movl	%eax, %edx
 **	addl	\$32, %eax
@@ -18,6 +22,7 @@
 **	movq	\$0, 24\(%rdi,%rdx\)
 **	cmpl	\$64, %eax
 **	jb	.L[0-9]+
+**	ret
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-30.c b/gcc/testsuite/gcc.target/i386/memset-strategy-30.c
index 917f151..f3912f8 100644
--- a/gcc/testsuite/gcc.target/i386/memset-strategy-30.c
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-30.c
@@ -8,7 +8,11 @@
 **...
 **.LFB[0-9]+:
 **	.cfi_startproc
+**	movq	\$0, 63\(%rdi\)
 **	xorl	%eax, %eax
+**	movq	\$0, 71\(%rdi\)
+**	movq	\$0, 79\(%rdi\)
+**	movq	\$0, 87\(%rdi\)
 **.L[0-9]+:
 **	movl	%eax, %edx
 **	addl	\$32, %eax
@@ -18,6 +22,7 @@
 **	movq	\$0, 24\(%rdi,%rdx\)
 **	cmpl	\$64, %eax
 **	jb	.L[0-9]+
+**	ret
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-31.c b/gcc/testsuite/gcc.target/i386/memset-strategy-31.c
index 17a4df2..4791c4d 100644
--- a/gcc/testsuite/gcc.target/i386/memset-strategy-31.c
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-31.c
@@ -9,6 +9,10 @@
 **...
 **	pxor	%xmm0, %xmm0
 **	xorl	%eax, %eax
+**	movups	%xmm0, 190\(%rdi\)
+**	movups	%xmm0, 206\(%rdi\)
+**	movups	%xmm0, 222\(%rdi\)
+**	movups	%xmm0, 238\(%rdi\)
 **.L[0-9]+:
 **	movl	%eax, %edx
 **	addl	\$64, %eax
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_arith.h b/gcc/testsuite/gcc.target/riscv/sat/sat_arith.h
index 84f013f..3de89f4 100644
--- a/gcc/testsuite/gcc.target/riscv/sat/sat_arith.h
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_arith.h
@@ -4,6 +4,8 @@
 #include <stdint-gcc.h>
 #include <stdbool.h>
 
+typedef __uint128_t uint128_t;
+
 /******************************************************************************/
 /* Saturation Add (unsigned and signed)                                       */
 /******************************************************************************/
@@ -648,4 +650,25 @@ sat_s_trunc_##WT##_to_##NT##_fmt_8 (WT x)             \
 #define RUN_SAT_S_TRUNC_FMT_8(NT, WT, x) sat_s_trunc_##WT##_to_##NT##_fmt_8 (x)
 #define RUN_SAT_S_TRUNC_FMT_8_WRAP(NT, WT, x) RUN_SAT_S_TRUNC_FMT_8(NT, WT, x)
 
+/******************************************************************************/
+/* Saturation Mult (unsigned and signed)                                  */
+/******************************************************************************/
+
+#define DEF_SAT_U_MUL_FMT_1(NT, WT)             \
+NT __attribute__((noinline))                    \
+sat_u_mul_##NT##_from_##WT##_fmt_1 (NT a, NT b) \
+{                                               \
+  WT x = (WT)a * (WT)b;                         \
+  NT max = -1;                                  \
+  if (x > (WT)(max))                            \
+    return max;                                 \
+  else                                          \
+    return (NT)x;                               \
+}
+
+#define DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT) DEF_SAT_U_MUL_FMT_1(NT, WT)
+#define RUN_SAT_U_MUL_FMT_1(NT, WT, a, b) \
+  sat_u_mul_##NT##_from_##WT##_fmt_1 (a, b)
+#define RUN_SAT_U_MUL_FMT_1_WRAP(NT, WT, a, b) RUN_SAT_U_MUL_FMT_1(NT, WT, a, b)
+
 #endif
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_arith_data.h b/gcc/testsuite/gcc.target/riscv/sat/sat_arith_data.h
index f100688..bd33ff1 100644
--- a/gcc/testsuite/gcc.target/riscv/sat/sat_arith_data.h
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_arith_data.h
@@ -12,6 +12,7 @@
 
 #define TEST_BINARY_STRUCT_NAME(T, NAME) test_##T##_##NAME##_s
 #define TEST_BINARY_STRUCT_DECL(T, NAME) struct TEST_BINARY_STRUCT_NAME(T, NAME)
+#define TEST_BINARY_STRUCT_DECL_WRAP(T, NAME) TEST_BINARY_STRUCT_DECL(T, NAME)
 #define TEST_BINARY_STRUCT(T, NAME)       \
   struct TEST_BINARY_STRUCT_NAME(T, NAME) \
     {                                     \
@@ -37,6 +38,11 @@ TEST_BINARY_STRUCT (uint16_t, usadd)
 TEST_BINARY_STRUCT (uint32_t, usadd)
 TEST_BINARY_STRUCT (uint64_t, usadd)
 
+TEST_BINARY_STRUCT (uint8_t, usmul)
+TEST_BINARY_STRUCT (uint16_t, usmul)
+TEST_BINARY_STRUCT (uint32_t, usmul)
+TEST_BINARY_STRUCT (uint64_t, usmul)
+
 TEST_BINARY_STRUCT (int8_t,  ssadd)
 TEST_BINARY_STRUCT (int16_t, ssadd)
 TEST_BINARY_STRUCT (int32_t, ssadd)
@@ -433,4 +439,60 @@ TEST_BINARY_STRUCT_DECL(int64_t, sssub) TEST_BINARY_DATA(int64_t, sssub)[] =
   {  9223372036854775806ll,   9223372036854775800ll,                       6},
 };
 
+TEST_BINARY_STRUCT_DECL(uint8_t, usmul) TEST_BINARY_DATA(uint8_t, usmul)[] =
+{
+  {      0,     0,      0, },
+  {      0,     1,      0, },
+  {      1,     1,      1, },
+  {      1,   127,    127, },
+  {      2,   127,    254, },
+  {      3,   127,    255, },
+  {    127,   127,    255, },
+  {      1,   255,    255, },
+  {    127,   255,    255, },
+  {    255,   255,    255, },
+};
+
+TEST_BINARY_STRUCT_DECL(uint16_t, usmul) TEST_BINARY_DATA(uint16_t, usmul)[] =
+{
+  {      0,     0,      0, },
+  {      0,     1,      0, },
+  {      1,     1,      1, },
+  {      1, 32767,  32767, },
+  {      2, 32767,  65534, },
+  {      3, 32767,  65535, },
+  {  32767, 32767,  65535, },
+  {      1, 65535,  65535, },
+  {  32767, 65535,  65535, },
+  {  65535, 65535,  65535, },
+};
+
+TEST_BINARY_STRUCT_DECL(uint32_t, usmul) TEST_BINARY_DATA(uint32_t, usmul)[] =
+{
+  {          0,          0,          0, },
+  {          0,          1,          0, },
+  {          1,          1,          1, },
+  {          1, 2147483647, 2147483647, },
+  {          2, 2147483647, 4294967294, },
+  {          3, 2147483647, 4294967295, },
+  { 2147483647, 2147483647, 4294967295, },
+  {          1, 4294967295, 4294967295, },
+  { 2147483647, 4294967295, 4294967295, },
+  { 4294967295, 4294967295, 4294967295, },
+};
+
+TEST_BINARY_STRUCT_DECL(uint64_t, usmul) TEST_BINARY_DATA(uint64_t, usmul)[] =
+{
+  {                       0,                       0,                       0, },
+  {                       0,                       1,                       0, },
+  {                       1,                       1,                       1, },
+  {                       1,  9223372036854775807ull,  9223372036854775807ull, },
+  {                       2,  9223372036854775807ull, 18446744073709551614ull, },
+  {                       3,  9223372036854775807ull, 18446744073709551615ull, },
+  {  9223372036854775807ull,  9223372036854775807ull, 18446744073709551615ull, },
+  {                       1, 18446744073709551615ull, 18446744073709551615ull, },
+  {  9223372036854775807ull, 18446744073709551615ull, 18446744073709551615ull, },
+  { 18446744073709551615ull, 18446744073709551615ull, 18446744073709551615ull, },
+};
+
 #endif
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u16-from-u128.c b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u16-from-u128.c
new file mode 100644
index 0000000..b60c91c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u16-from-u128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc -mabi=lp64d -fdump-tree-optimized" } */
+
+#include "sat_arith.h"
+
+#define NT uint16_t
+#define WT uint128_t
+
+DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT)
+
+/* { dg-final { scan-tree-dump-times ".SAT_MUL" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u32-from-u128.c b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u32-from-u128.c
new file mode 100644
index 0000000..1ac6f39
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u32-from-u128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc -mabi=lp64d -fdump-tree-optimized" } */
+
+#include "sat_arith.h"
+
+#define NT uint32_t
+#define WT uint128_t
+
+DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT)
+
+/* { dg-final { scan-tree-dump-times ".SAT_MUL" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u64-from-u128.c b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u64-from-u128.c
new file mode 100644
index 0000000..af12d82
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u64-from-u128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc -mabi=lp64d -fdump-tree-optimized" } */
+
+#include "sat_arith.h"
+
+#define NT uint64_t
+#define WT uint128_t
+
+DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT)
+
+/* { dg-final { scan-tree-dump-times ".SAT_MUL" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u8-from-u128.c b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u8-from-u128.c
new file mode 100644
index 0000000..c73353a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-1-u8-from-u128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc -mabi=lp64d -fdump-tree-optimized" } */
+
+#include "sat_arith.h"
+
+#define NT uint8_t
+#define WT uint128_t
+
+DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT)
+
+/* { dg-final { scan-tree-dump-times ".SAT_MUL" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u16-from-u128.c b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u16-from-u128.c
new file mode 100644
index 0000000..395a4cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u16-from-u128.c
@@ -0,0 +1,16 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#include "sat_arith.h"
+#include "sat_arith_data.h"
+
+#define NT               uint16_t
+#define WT               uint128_t
+#define NAME             usmul
+#define DATA             TEST_BINARY_DATA_WRAP(NT, NAME)
+#define T                TEST_BINARY_STRUCT_DECL_WRAP(NT, NAME)
+#define RUN_BINARY(x, y) RUN_SAT_U_MUL_FMT_1_WRAP(NT, WT, x, y)
+
+DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT)
+
+#include "scalar_sat_binary_run_xxx.h"
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u32-from-u128.c b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u32-from-u128.c
new file mode 100644
index 0000000..3c8b728
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u32-from-u128.c
@@ -0,0 +1,16 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#include "sat_arith.h"
+#include "sat_arith_data.h"
+
+#define NT               uint32_t
+#define WT               uint128_t
+#define NAME             usmul
+#define DATA             TEST_BINARY_DATA_WRAP(NT, NAME)
+#define T                TEST_BINARY_STRUCT_DECL_WRAP(NT, NAME)
+#define RUN_BINARY(x, y) RUN_SAT_U_MUL_FMT_1_WRAP(NT, WT, x, y)
+
+DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT)
+
+#include "scalar_sat_binary_run_xxx.h"
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u64-from-u128.c b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u64-from-u128.c
new file mode 100644
index 0000000..e5572de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u64-from-u128.c
@@ -0,0 +1,16 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#include "sat_arith.h"
+#include "sat_arith_data.h"
+
+#define NT               uint64_t
+#define WT               uint128_t
+#define NAME             usmul
+#define DATA             TEST_BINARY_DATA_WRAP(NT, NAME)
+#define T                TEST_BINARY_STRUCT_DECL_WRAP(NT, NAME)
+#define RUN_BINARY(x, y) RUN_SAT_U_MUL_FMT_1_WRAP(NT, WT, x, y)
+
+DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT)
+
+#include "scalar_sat_binary_run_xxx.h"
diff --git a/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u8-from-u128.c b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u8-from-u128.c
new file mode 100644
index 0000000..2e9c39a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/sat/sat_u_mul-run-1-u8-from-u128.c
@@ -0,0 +1,16 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#include "sat_arith.h"
+#include "sat_arith_data.h"
+
+#define NT               uint8_t
+#define WT               uint128_t
+#define NAME             usmul
+#define DATA             TEST_BINARY_DATA_WRAP(NT, NAME)
+#define T                TEST_BINARY_STRUCT_DECL_WRAP(NT, NAME)
+#define RUN_BINARY(x, y) RUN_SAT_U_MUL_FMT_1_WRAP(NT, WT, x, y)
+
+DEF_SAT_U_MUL_FMT_1_WRAP(NT, WT)
+
+#include "scalar_sat_binary_run_xxx.h"
diff --git a/gcc/testsuite/gcc.target/s390/vector/pattern-avg-1.c b/gcc/testsuite/gcc.target/s390/vector/pattern-avg-1.c
new file mode 100644
index 0000000..a15301a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/pattern-avg-1.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z16 -ftree-vectorize -fdump-tree-optimized" } */
+
+#define TEST(T1,T2,N)                                                   \
+  void                                                                  \
+  avg##T1 (signed T1 *__restrict res, signed T1 *__restrict a,          \
+           signed T1 *__restrict b)                                     \
+  {                                                                     \
+    for (int i = 0; i < N; ++i)                                         \
+      res[i] = ((signed T2)a[i] + b[i] + 1) >> 1;                       \
+  }                                                                     \
+                                                                        \
+  void                                                                  \
+  uavg##T1 (unsigned T1 *__restrict res, unsigned T1 *__restrict a,     \
+            unsigned T1 *__restrict b)                                  \
+  {                                                                     \
+    for (int i = 0; i < N; ++i)                                         \
+      res[i] = ((unsigned T2)a[i] + b[i] + 1) >> 1;                     \
+  }
+
+TEST(char,short,16)
+TEST(short,int,8)
+TEST(int,long,4)
+TEST(long,__int128,2)
+
+/* { dg-final { scan-tree-dump-times "\.AVG_CEIL" 8 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/s390/vector/pattern-mulh-1.c b/gcc/testsuite/gcc.target/s390/vector/pattern-mulh-1.c
new file mode 100644
index 0000000..cd8e4e7d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/vector/pattern-mulh-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=arch15 -ftree-vectorize -fdump-tree-optimized" } */
+
+#define TEST(T1,T2,N,S)                                                 \
+  void                                                                  \
+  mulh##T1 (signed T1 *__restrict res,                                  \
+            signed T1 *__restrict l,                                    \
+            signed T1 *__restrict r)                                    \
+  {                                                                     \
+    for (int i = 0; i < N; ++i)                                         \
+      res[i] = (signed T1) (((signed T2)l[i] * (signed T2)r[i]) >> S);  \
+  }                                                                     \
+                                                                        \
+  void                                                                  \
+  umulh##T1 (unsigned T1 *__restrict res,                               \
+             unsigned T1 *__restrict l,                                 \
+             unsigned T1 *__restrict r)                                 \
+  {                                                                     \
+    for (int i = 0; i < N; ++i)                                         \
+      res[i] = (unsigned T1)                                            \
+        (((unsigned T2)l[i] * (unsigned T2)r[i]) >> S);                 \
+  }
+
+TEST(char,short,16,8)
+TEST(short,int,8,16)
+TEST(int,long,4,32)
+TEST(long,__int128,2,64)
+
+/* { dg-final { scan-tree-dump-times "\.MULH" 8 "optimized" } } */
diff --git a/gcc/testsuite/gfortran.dg/move_alloc_20.f03 b/gcc/testsuite/gfortran.dg/move_alloc_20.f03
new file mode 100644
index 0000000..20403c3
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/move_alloc_20.f03
@@ -0,0 +1,151 @@
+! { dg-do run }
+!
+! Check the presence of the pre and post code of the FROM and TO arguments
+! of the MOVE_ALLOC intrinsic subroutine.
+
+module m
+  implicit none
+  type :: t
+    integer, allocatable :: a(:)
+  end type
+end module 
+
+module pre
+  use m
+  implicit none
+  private
+  public :: check_pre
+
+contains
+
+  subroutine check_pre
+    integer, parameter :: n = 5
+    type(t) :: x(n)
+    integer, allocatable :: tmp(:)
+    integer :: array(4) = [ -1, 0, 1, 2 ]
+    integer :: i
+
+    if (allocated(tmp)) error stop 1
+
+    tmp = [17]
+
+    if (.not. allocated(tmp)) error stop 11
+    if (any(shape(tmp) /= [1])) error stop 12
+    if (any(tmp /= [17])) error stop 13
+    do i=1,n
+      if (allocated(x(i)%a)) error stop 14
+    end do
+
+    ! Check that the index of X is properly computed for the evaluation of TO.
+    call move_alloc(tmp, x(sum(array))%a)
+
+    do i=1,n
+      if (i == 2) cycle
+      if (allocated(x(i)%a)) error stop 21
+    end do
+    if (.not. allocated(x(2)%a)) error stop 22
+    if (any(shape(x(2)%a) /= [1])) error stop 23
+    if (any(x(2)%a /= [17])) error stop 24
+    if (allocated(tmp)) error stop 25
+
+    ! Check that the index of X is properly computed for the evaluation of FROM.
+    call move_alloc(x(sum(array))%a, tmp)
+
+    if (.not. allocated(tmp)) error stop 31
+    if (any(shape(tmp) /= [1])) error stop 32
+    if (any(tmp /= [17])) error stop 33
+    do i=1,n
+      if (allocated(x(i)%a)) error stop 34
+    end do
+  end subroutine
+
+end module
+
+module post
+  use m
+  implicit none
+  private
+  public :: check_post
+  integer, parameter :: n = 5
+  type(t), target :: x(n)
+  type :: u
+    integer :: a
+  contains
+    final :: finalize
+  end type
+  integer :: finalization_count = 0
+
+contains
+
+  function idx(arg)
+    type(u) :: arg
+    integer :: idx
+    idx = mod(arg%a, n)
+  end function
+
+  subroutine check_post
+    type(u) :: y
+    integer, allocatable :: tmp(:)
+    integer, target :: array(4) = [ -1, 0, 1, 2 ]
+    integer :: i
+
+    y%a = 12
+
+    if (allocated(tmp)) error stop 1
+
+    tmp = [37]
+
+    if (.not. allocated(tmp)) error stop 11
+    if (any(shape(tmp) /= [1])) error stop 12
+    if (any(tmp /= [37])) error stop 13
+    if (finalization_count /= 0) error stop 14
+    do i=1,n
+      if (allocated(x(i)%a)) error stop 15
+    end do
+
+    ! Check that the cleanup code for the evaluation of TO is properly
+    ! executed after MOVE_ALLOC: the result of GET_U should be finalized.
+    call move_alloc(tmp, x(idx(get_u(y)))%a)
+
+    do i=1,n
+      if (i == 2) cycle
+      if (allocated(x(i)%a)) error stop 21
+    end do
+    if (.not. allocated(x(2)%a)) error stop 22
+    if (any(shape(x(2)%a) /= [1])) error stop 23
+    if (any(x(2)%a /= [37])) error stop 24
+    if (allocated(tmp)) error stop 25
+    if (finalization_count /= 1) error stop 26
+
+    ! Check that the cleanup code for the evaluation of FROM is properly
+    ! executed after MOVE_ALLOC: the result of GET_U should be finalized.
+    call move_alloc(x(idx(get_u(y)))%a, tmp)
+
+    if (.not. allocated(tmp)) error stop 31
+    if (any(shape(tmp) /= [1])) error stop 32
+    if (any(tmp /= [37])) error stop 33
+    if (finalization_count /= 2) error stop 34
+    do i=1,n
+      if (allocated(x(i)%a)) error stop 35
+    end do
+  end subroutine
+
+  function get_u(arg)
+    type(u) :: arg, get_u
+    get_u = arg
+  end function get_u
+
+  subroutine finalize(obj)
+    type(u) :: obj
+    finalization_count = finalization_count + 1
+  end subroutine
+
+end module
+
+program p
+  use pre
+  use post
+  implicit none
+  call check_pre
+  call check_post
+end program
diff --git a/gcc/tree-call-cdce.cc b/gcc/tree-call-cdce.cc
index 649c1e2..3edea75 100644
--- a/gcc/tree-call-cdce.cc
+++ b/gcc/tree-call-cdce.cc
@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "internal-fn.h"
 #include "tree-dfa.h"
+#include "tree-eh.h"
 
 
 /* This pass serves two closely-related purposes:
@@ -1222,8 +1223,20 @@ use_internal_fn (gcall *call)
     {
       /* Skip the call if LHS == LHS.  If we reach here, EDOM is the only
 	 valid errno value and it is used iff the result is NaN.  */
-      conds.quick_push (gimple_build_cond (EQ_EXPR, lhs, lhs,
-					   NULL_TREE, NULL_TREE));
+      /* In the case of non call exceptions, with signaling NaNs, EQ_EXPR
+	 can throw an exception and that can't be part of the GIMPLE_COND. */
+      if (flag_exceptions
+	  && cfun->can_throw_non_call_exceptions
+	  && operation_could_trap_p (EQ_EXPR, true, false, NULL_TREE))
+	{
+	  tree b = make_ssa_name (boolean_type_node);
+	  conds.quick_push (gimple_build_assign (b, EQ_EXPR, lhs, lhs));
+	  conds.quick_push (gimple_build_cond (NE_EXPR, b, boolean_false_node,
+					       NULL_TREE, NULL_TREE));
+	}
+      else
+	conds.quick_push (gimple_build_cond (EQ_EXPR, lhs, lhs,
+					     NULL_TREE, NULL_TREE));
       nconds++;
 
       /* Try replacing the original call with a direct assignment to
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 72763fd..9a5479a 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -4623,6 +4623,14 @@ verify_gimple_assign_single (gassign *stmt)
       return true;
     }
 
+  /* LHS can't be a constant or an address expression. */
+  if (CONSTANT_CLASS_P (lhs)|| TREE_CODE (lhs) == ADDR_EXPR)
+    {
+      error ("invalid LHS (%qs) for assignment: %qs",
+	     get_tree_code_name (TREE_CODE (lhs)), code_name);
+      return true;
+    }
+
   if (gimple_clobber_p (stmt)
       && !(DECL_P (lhs) || TREE_CODE (lhs) == MEM_REF))
     {
@@ -4745,6 +4753,11 @@ verify_gimple_assign_single (gassign *stmt)
 
 	  if (CONSTRUCTOR_NELTS (rhs1) == 0)
 	    return res;
+	  if (!is_gimple_reg (lhs))
+	    {
+	      error ("non-register as LHS with vector constructor");
+	      return true;
+	    }
 	  /* For vector CONSTRUCTORs we require that either it is empty
 	     CONSTRUCTOR, or it is a CONSTRUCTOR of smaller vector elements
 	     (then the element count must be correct to cover the whole
diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
index 5ac4280..51a5723 100644
--- a/gcc/tree-ssa-dse.cc
+++ b/gcc/tree-ssa-dse.cc
@@ -181,10 +181,10 @@ initialize_ao_ref_for_dse (gimple *stmt, ao_ref *write, bool may_def_ok = false)
 	       can provide a may-def variant.  */
 	    if (may_def_ok)
 	      {
-		ao_ref_init_from_ptr_and_size (
-		  write, gimple_call_arg (stmt, 0),
-		  TYPE_SIZE_UNIT (
-		    TREE_TYPE (gimple_call_arg (stmt, stored_value_index))));
+		ao_ref_init_from_ptr_and_range (
+		  write, gimple_call_arg (stmt, 0), true, 0, -1,
+		  tree_to_poly_int64 (TYPE_SIZE (
+		    TREE_TYPE (gimple_call_arg (stmt, stored_value_index)))));
 		return true;
 	      }
 	    break;
diff --git a/gcc/tree-ssa-live.cc b/gcc/tree-ssa-live.cc
index 5b8bfd0..5e08913 100644
--- a/gcc/tree-ssa-live.cc
+++ b/gcc/tree-ssa-live.cc
@@ -702,7 +702,10 @@ dump_scope_block (FILE *file, int indent, tree scope, dump_flags_t flags)
   if (LOCATION_LOCUS (BLOCK_SOURCE_LOCATION (scope)) != UNKNOWN_LOCATION)
     {
       expanded_location s = expand_location (BLOCK_SOURCE_LOCATION (scope));
-      fprintf (file, " %s:%i", s.file, s.line);
+      fprintf (file, " %s:%i:%i", s.file, s.line, s.column);
+      if (has_discriminator (BLOCK_SOURCE_LOCATION (scope)))
+	fprintf (file, " discrim %i",
+		 get_discriminator_from_loc (BLOCK_SOURCE_LOCATION (scope)));
     }
   if (BLOCK_ABSTRACT_ORIGIN (scope))
     {
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 4cfcc42..ca98205 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -4064,6 +4064,7 @@ arith_overflow_check_p (gimple *stmt, gimple *cast_stmt, gimple *&use_stmt,
 extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
 extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
 extern bool gimple_unsigned_integer_sat_trunc (tree, tree*, tree (*)(tree));
+extern bool gimple_unsigned_integer_sat_mul (tree, tree*, tree (*)(tree));
 
 extern bool gimple_signed_integer_sat_add (tree, tree*, tree (*)(tree));
 extern bool gimple_signed_integer_sat_sub (tree, tree*, tree (*)(tree));
@@ -4217,6 +4218,30 @@ match_unsigned_saturation_sub (gimple_stmt_iterator *gsi, gassign *stmt)
 }
 
 /*
+ * Try to match saturation unsigned mul.
+ *   _1 = (unsigned int) a_6(D);
+ *   _2 = (unsigned int) b_7(D);
+ *   x_8 = _1 * _2;
+ *   overflow_9 = x_8 > 255;
+ *   _3 = (unsigned char) overflow_9;
+ *   _4 = -_3;
+ *   _5 = (unsigned char) x_8;
+ *   _10 = _4 | _5;
+ *   =>
+ *   _10 = .SAT_SUB (a_6, b_7);  */
+
+static void
+match_unsigned_saturation_mul (gimple_stmt_iterator *gsi, gassign *stmt)
+{
+  tree ops[2];
+  tree lhs = gimple_assign_lhs (stmt);
+
+  if (gimple_unsigned_integer_sat_mul (lhs, ops, NULL))
+    build_saturation_binary_arith_call_and_replace (gsi, IFN_SAT_MUL, lhs,
+						    ops[0], ops[1]);
+}
+
+/*
  * Try to match saturation unsigned sub.
  *  <bb 2> [local count: 1073741824]:
  *  if (x_2(D) > y_3(D))
@@ -6469,6 +6494,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
 	      break;
 
 	    case NOP_EXPR:
+	      match_unsigned_saturation_mul (&gsi, as_a<gassign *> (stmt));
 	      match_unsigned_saturation_trunc (&gsi, as_a<gassign *> (stmt));
 	      match_saturation_add_with_assign (&gsi, as_a<gassign *> (stmt));
 	      break;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 95406b4..5767a35 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -14410,6 +14410,8 @@ supportable_widening_operation (vec_info *vinfo,
 
       internal_fn lo, hi, even, odd;
       lookup_hilo_internal_fn (ifn, &lo, &hi);
+      if (BYTES_BIG_ENDIAN)
+	std::swap (lo, hi);
       *code1 = as_combined_fn (lo);
       *code2 = as_combined_fn (hi);
       optab1 = direct_internal_fn_optab (lo, {vectype, vectype});