17 files changed, 541 insertions, 29 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 264bbd2..4a2a79f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,46 @@
+2025-04-19  Jeff Law  <jlaw@ventanamicro.com>
+
+	PR target/119865
+	* config/riscv/riscv.cc (parse_features_for_version): Do not
+	explicitly free the architecture string.
+
+2025-04-19  Jeff Law  <jlaw@ventanamicro.com>
+
+	PR target/118410
+	* config/riscv/bitmanip.md (logical with constant argument): New
+	splitter for cases where synthesizing ~C is cheaper than synthesizing
+	the original constant C.
+
+2025-04-19  Jan Hubicka  <hubicka@ucw.cz>
+
+	* config/i386/i386.cc (vec_fp_conversion_cost): New function.
+	(ix86_rtx_costs): Use it for SSE/AVX FP conversoins.
+	(ix86_builtin_vectorization_cost): Fix indentation;
+	and use vec_fp_conversion_cost in vec_promote_demote.
+	(fp_conversion_stmt_cost): New function.
+	(ix86_vector_costs::add_stmt_cost): Use it to cost NOP_EXPR
+	and vec_promote_demote.
+	* config/i386/i386.h (struct processor_costs):
+	* config/i386/x86-tune-costs.h (struct processor_costs):
+
+2025-04-19  Andrew Pinski  <quic_apinski@quicinc.com>
+
+	PR rtl-optimization/111949
+	* combine.cc (find_split_point): Add a split point
+	for `(and (not X) Y)` if not in the outer set already.
+
+2025-04-19  Jiaxun Yang  <jiaxun.yang@flygoat.com>
+
+	PR target/111814
+	* config/sh/sh-modes.def (RESET_FLOAT_FORMAT): Use mips format.
+	(FLOAT_MODE): Use mips mode.
+
+2025-04-19  Maciej W. Rozycki  <macro@orcam.me.uk>
+
+	* config/alpha/alpha.cc
+	(alpha_get_mem_rtx_alignment_and_offset): Recurse into
+	COMPONENT_REF nodes.
+
 2025-04-18  Jeff Law  <jlaw@ventanamicro.com>
 
 	* config/riscv/bitmanip.md (*bext<mode>_mask_pos): New pattern
diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP
index f0d1b43..2aaf995 100644
--- a/gcc/DATESTAMP
+++ b/gcc/DATESTAMP
@@ -1 +1 @@
-20250419
+20250421
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index cdfd94d..36f71eb 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -19256,8 +19256,6 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
   e1 = gen_reg_rtx (mode);
   x1 = gen_reg_rtx (mode);
 
-  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
-
   b = force_reg (mode, b);
 
   /* x0 = rcp(b) estimate */
@@ -19270,20 +19268,42 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
 						UNSPEC_RCP)));
 
-  /* e0 = x0 * b */
-  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
+  unsigned vector_size = GET_MODE_SIZE (mode);
+
+  /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
+     N-R step with 2 fma implementation.  */
+  if (TARGET_FMA
+      || (TARGET_AVX512F && vector_size == 64)
+      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
+    {
+      /* e0 = x0 * a  */
+      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
+      /* e1 = e0 * b - a  */
+      emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
+					       gen_rtx_NEG (mode, a))));
+      /* res = - e1 * x0 + e0  */
+      emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
+					       gen_rtx_NEG (mode, e1),
+					       x0, e0)));
+    }
+  else
+    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
+    {
+      /* e0 = x0 * b */
+      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
 
-  /* e0 = x0 * e0 */
-  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
+      /* e1 = x0 + x0 */
+      emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
 
-  /* e1 = x0 + x0 */
-  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
+      /* e0 = x0 * e0 */
+      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
 
-  /* x1 = e1 - e0 */
-  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
+      /* x1 = e1 - e0 */
+      emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
 
-  /* res = a * x1 */
-  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
+      /* res = a * x1 */
+      emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
+    }
 }
 
 /* Output code to perform a Newton-Rhapson approximation of a
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 38df84f..28603c2 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "i386-features.h"
 #include "function-abi.h"
 #include "rtl-error.h"
+#include "gimple-pretty-print.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -21816,6 +21817,25 @@ ix86_insn_cost (rtx_insn *insn, bool speed)
   return insn_cost + pattern_cost (PATTERN (insn), speed);
 }
 
+/* Return cost of SSE/AVX FP->FP conversion (extensions and truncates).  */
+
+static int
+vec_fp_conversion_cost (const struct processor_costs *cost, int size)
+{
+  if (size < 128)
+    return cost->cvtss2sd;
+  else if (size < 256)
+    {
+      if (TARGET_SSE_SPLIT_REGS)
+	return cost->cvtss2sd * size / 64;
+      return cost->cvtss2sd;
+    }
+  if (size < 512)
+    return cost->vcvtps2pd256;
+  else
+    return cost->vcvtps2pd512;
+}
+
 /* Compute a (partial) cost for rtx X.  Return true if the complete
    cost has been computed, and false if subexpressions should be
    scanned.  In either case, *TOTAL contains the cost result.  */
@@ -22479,17 +22499,18 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
       return false;
 
     case FLOAT_EXTEND:
+      /* x87 represents all values extended to 80bit.  */
       if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
 	*total = 0;
       else
-        *total = ix86_vec_cost (mode, cost->addss);
+	*total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
       return false;
 
     case FLOAT_TRUNCATE:
       if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
 	*total = cost->fadd;
       else
-        *total = ix86_vec_cost (mode, cost->addss);
+	*total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
       return false;
 
     case ABS:
@@ -24683,7 +24704,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
   switch (type_of_cost)
     {
       case scalar_stmt:
-        return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
+	return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
 
       case scalar_load:
 	/* load/store costs are relative to register move which is 2. Recompute
@@ -24754,7 +24775,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
         return ix86_cost->cond_not_taken_branch_cost;
 
       case vec_perm:
+	return ix86_vec_cost (mode, ix86_cost->sse_op);
+
       case vec_promote_demote:
+	if (fp)
+	  return vec_fp_conversion_cost (ix86_tune_cost, mode);
         return ix86_vec_cost (mode, ix86_cost->sse_op);
 
       case vec_construct:
@@ -25232,6 +25257,32 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
   return new ix86_vector_costs (vinfo, costing_for_scalar);
 }
 
+/* Return cost of statement doing FP conversion.  */
+
+static unsigned
+fp_conversion_stmt_cost (machine_mode mode, gimple *stmt, bool scalar_p)
+{
+  int outer_size
+    = tree_to_uhwi
+	(TYPE_SIZE
+	    (TREE_TYPE (gimple_assign_lhs (stmt))));
+  int inner_size
+    = tree_to_uhwi
+	(TYPE_SIZE
+	    (TREE_TYPE (gimple_assign_rhs1 (stmt))));
+  int stmt_cost = vec_fp_conversion_cost
+		    (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+  /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
+     up doing two conversions and packing them.  */
+  if (!scalar_p && inner_size > outer_size)
+    {
+      int n = inner_size / outer_size;
+      stmt_cost = stmt_cost * n
+		  + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
+    }
+  return stmt_cost;
+}
+
 unsigned
 ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 				  stmt_vec_info stmt_info, slp_tree node,
@@ -25342,6 +25393,9 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	        (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
 		 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
 	    stmt_cost = 0;
+	  else if (fp)
+	    stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt,
+						 scalar_p);
 	  break;
 
 	case BIT_IOR_EXPR:
@@ -25383,6 +25437,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	break;
       }
 
+  if (kind == vec_promote_demote
+      && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
+    stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, scalar_p);
+
   /* If we do elementwise loads into a vector then we are bound by
      latency and execution resources for the many scalar loads
      (AGU and load ports).  Try to account for this by scaling the
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8507243..18aa42d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -207,6 +207,12 @@ struct processor_costs {
   const int divsd;		/* cost of DIVSD instructions.  */
   const int sqrtss;		/* cost of SQRTSS instructions.  */
   const int sqrtsd;		/* cost of SQRTSD instructions.  */
+  const int cvtss2sd;		/* cost SSE FP conversions,
+				   such as CVTSS2SD.  */
+  const int vcvtps2pd256;	/* cost 256bit packed FP conversions,
+				   such as VCVTPD2PS with larger reg in ymm.  */
+  const int vcvtps2pd512;	/* cost 512bit packed FP conversions,
+				   such as VCVTPD2PS with larger reg in zmm.  */
   const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp;
 				/* Specify reassociation width for integer,
 				   fp, vector integer and vector fp
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 9477345..cddcf61 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -121,16 +121,19 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
 
-  COSTS_N_BYTES (2),			/* cost of cheap SSE instruction.  */
-  COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
-  COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
-  COSTS_N_BYTES (2),			/* cost of FMA SS instruction.  */
-  COSTS_N_BYTES (2),			/* cost of FMA SD instruction.  */
-  COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
-  COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
-  COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
-  COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
+  COSTS_N_BYTES (4),			/* cost of cheap SSE instruction.  */
+  COSTS_N_BYTES (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_BYTES (4),			/* cost of MULSS instruction.  */
+  COSTS_N_BYTES (4),			/* cost of MULSD instruction.  */
+  COSTS_N_BYTES (4),			/* cost of FMA SS instruction.  */
+  COSTS_N_BYTES (4),			/* cost of FMA SD instruction.  */
+  COSTS_N_BYTES (4),			/* cost of DIVSS instruction.  */
+  COSTS_N_BYTES (4),			/* cost of DIVSD instruction.  */
+  COSTS_N_BYTES (4),			/* cost of SQRTSS instruction.  */
+  COSTS_N_BYTES (4),			/* cost of SQRTSD instruction.  */
+  COSTS_N_BYTES (4),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_BYTES (4),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_BYTES (6),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   ix86_size_memcpy,
   ix86_size_memset,
@@ -243,6 +246,9 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (27),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (54),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (108),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i386_memcpy,
   i386_memset,
@@ -356,6 +362,9 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (74),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (8),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (16),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (32),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i486_memcpy,
   i486_memset,
@@ -467,6 +476,9 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
@@ -571,6 +583,9 @@ struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (5),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (10),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (20),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
@@ -690,6 +705,9 @@ struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
@@ -800,6 +818,9 @@ struct processor_costs geode_cost = {
   COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (24),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   geode_memcpy,
   geode_memset,
@@ -913,6 +934,9 @@ struct processor_costs k6_cost = {
   COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (4),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (8),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k6_memcpy,
   k6_memset,
@@ -1027,6 +1051,9 @@ struct processor_costs athlon_cost = {
   COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (8),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (16),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   athlon_memcpy,
   athlon_memset,
@@ -1150,6 +1177,9 @@ struct processor_costs k8_cost = {
   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (8),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (16),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k8_memcpy,
   k8_memset,
@@ -1281,6 +1311,9 @@ struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (8),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (16),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   amdfam10_memcpy,
   amdfam10_memset,
@@ -1405,6 +1438,9 @@ const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (7),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver_memcpy,
   bdver_memset,
@@ -1553,6 +1589,10 @@ struct processor_costs znver1_cost = {
   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  /* Real latency is 4, but for split regs multiply cost of half op by 2.  */
+  COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
   /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
      and it can execute 2 integer additions and 2 multiplications thus
      reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
@@ -1712,6 +1752,9 @@ struct processor_costs znver2_cost = {
   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (5),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (10),			/* cost of 512bit VCVTPS2PD etc.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -1847,6 +1890,9 @@ struct processor_costs znver3_cost = {
   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (5),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (10),			/* cost of 512bit VCVTPS2PD etc.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -1984,6 +2030,10 @@ struct processor_costs znver4_cost = {
   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (5),			/* cost of 256bit VCVTPS2PD etc.  */
+  /* Real latency is 6, but for split regs multiply cost of half op by 2.  */
+  COSTS_N_INSNS (10),			/* cost of 512bit VCVTPS2PD etc.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -2135,6 +2185,9 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   /* DIVSD has throughtput 0.13 and latency 20.  */
   COSTS_N_INSNS (20),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (5),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (5),			/* cost of 512bit VCVTPS2PD etc.  */
   /* Zen5 can execute:
       - integer ops: 6 per cycle, at most 3 multiplications.
 	latency 1 for additions, 3 for multiplications (pipelined)
@@ -2274,6 +2327,9 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (2),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (4),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   skylake_memcpy,
   skylake_memset,
@@ -2403,6 +2459,9 @@ struct processor_costs icelake_cost = {
   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (2),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (2),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   icelake_memcpy,
   icelake_memset,
@@ -2526,6 +2585,9 @@ struct processor_costs alderlake_cost = {
   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (2),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (2),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   alderlake_memcpy,
   alderlake_memset,
@@ -2642,6 +2704,9 @@ const struct processor_costs btver1_cost = {
   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (7),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver1_memcpy,
   btver1_memset,
@@ -2755,6 +2820,9 @@ const struct processor_costs btver2_cost = {
   COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (7),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver2_memcpy,
   btver2_memset,
@@ -2867,6 +2935,9 @@ struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (10),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (20),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (40),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium4_memcpy,
   pentium4_memset,
@@ -2982,6 +3053,9 @@ struct processor_costs nocona_cost = {
   COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (10),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (20),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (40),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   nocona_memcpy,
   nocona_memset,
@@ -3095,6 +3169,9 @@ struct processor_costs atom_cost = {
   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (24),			/* cost of 512bit VCVTPS2PD etc.  */
   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   atom_memcpy,
   atom_memset,
@@ -3208,6 +3285,9 @@ struct processor_costs slm_cost = {
   COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   slm_memcpy,
   slm_memset,
@@ -3335,6 +3415,9 @@ struct processor_costs tremont_cost = {
   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   tremont_memcpy,
   tremont_memset,
@@ -3448,6 +3531,9 @@ struct processor_costs intel_cost = {
   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (8),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (16),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (32),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   intel_memcpy,
   intel_memset,
@@ -3566,6 +3652,9 @@ struct processor_costs lujiazui_cost = {
   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (60),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   lujiazui_memcpy,
   lujiazui_memset,
@@ -3682,6 +3771,9 @@ struct processor_costs yongfeng_cost = {
   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
   4, 4, 4, 4,				/* reassoc int, fp, vec_int, vec_fp.  */
   yongfeng_memcpy,
   yongfeng_memset,
@@ -3798,6 +3890,9 @@ struct processor_costs shijidadao_cost = {
   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (11),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
   4, 4, 4, 4,				/* reassoc int, fp, vec_int, vec_fp.  */
   shijidadao_memcpy,
   shijidadao_memset,
@@ -3922,6 +4017,9 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (4),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (5),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   generic_memcpy,
   generic_memset,
@@ -4051,6 +4149,9 @@ struct processor_costs core_cost = {
   COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (2),			/* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (2),			/* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   core_memcpy,
   core_memset,
diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index 2a3884c..d0919ec 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -1263,3 +1263,41 @@
   expand_crc_using_clmul (<SUBX:MODE>mode, <SUBX1:MODE>mode, operands);
   DONE;
 })
+
+;; If we have an XOR/IOR with a constant operand (C) and the we can
+;; synthesize ~C more efficiently than C, then synthesize ~C and use
+;; xnor/orn instead.
+;;
+;; The same can be done for AND, but mvconst_internal's issues get in
+;; the way.  That's future work.
+(define_split
+  [(set (match_operand:X 0 "register_operand")
+	(any_or:X (match_operand:X 1 "register_operand")
+		  (match_operand:X 2 "const_int_operand")))
+   (clobber (match_operand:X 3 "register_operand"))]
+  "TARGET_ZBB
+   && (riscv_const_insns (operands[2], true)
+       > riscv_const_insns (GEN_INT (~INTVAL (operands[2])), true))"
+  [(const_int 0)]
+{
+  /* Get the inverted constant into the temporary register.  */
+  riscv_emit_move (operands[3], GEN_INT (~INTVAL (operands[2])));
+
+  /* For xnor, the NOT operation is in a different position.  So
+     we have to customize the split code we generate a bit.
+
+     It is expected that AND will be handled like IOR in the future.  */
+  if (<CODE> == XOR)
+    {
+      rtx x = gen_rtx_XOR (<X:MODE>mode, operands[1], operands[3]);
+      x = gen_rtx_NOT (<X:MODE>mode, x);
+      emit_insn (gen_rtx_SET (operands[0], x));
+    }
+  else
+    {
+      rtx x = gen_rtx_NOT (<X:MODE>mode, operands[3]);
+      x = gen_rtx_IOR (<X:MODE>mode, x, operands[1]);
+      emit_insn (gen_rtx_SET (operands[0], x));
+    }
+  DONE;
+})
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index d3656a7..bad59e2 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -13136,9 +13136,6 @@ parse_features_for_version (tree decl,
 					  DECL_SOURCE_LOCATION (decl));
   gcc_assert (parse_res);
 
-  if (arch_string != default_opts->x_riscv_arch_string)
-    free (CONST_CAST (void *, (const void *) arch_string));
-
   cl_target_option_restore (&global_options, &global_options_set,
 			    &cur_target);
 }
diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog
index 644b36a..6975efb 100644
--- a/gcc/cp/ChangeLog
+++ b/gcc/cp/ChangeLog
@@ -1,3 +1,12 @@
+2025-04-19  Jason Merrill  <jason@redhat.com>
+
+	* coroutines.cc (coro_build_expr_stmt)
+	(coro_build_cvt_void_expr_stmt): Remove.
+	(build_actor_fn): Use finish_expr_stmt.
+	* semantics.cc (finish_expr_stmt): Avoid wrapping statement in
+	EXPR_STMT.
+	(finish_stmt_expr_expr): Add comment.
+
 2025-04-17  Jason Merrill  <jason@redhat.com>
 
 	* constexpr.cc (is_valid_constexpr_fn): Improve diagnostic.
diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog
index 1c45bdb..56325a9 100644
--- a/gcc/fortran/ChangeLog
+++ b/gcc/fortran/ChangeLog
@@ -1,3 +1,10 @@
+2025-04-19  Steven G. Kargl  <kargl@gcc.gnu.org>
+
+	PR fortran/119836
+	* resolve.cc (check_pure_function): Fix checking for
+	an impure subprogram within a DO CONCURRENT construct.
+	(pure_subroutine): Ditto.
+
 2025-04-16  Harald Anlauf  <anlauf@gmx.de>
 
 	PR fortran/106948
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 4cdc9c1..521176a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,50 @@
+2025-04-20  H.J. Lu  <hjl.tools@gmail.com>
+
+	PR target/117863
+	* gcc.dg/rtl/i386/vector_eq-2.c: New test.
+	* gcc.dg/rtl/i386/vector_eq-3.c: Likewise.
+
+2025-04-19  Thomas Schwinge  <tschwinge@baylibre.com>
+
+	PR testsuite/119508
+	* rust/compile/nr2/compile.exp: Disable parallel testing.
+
+2025-04-19  Co-authored-by: Jeff Law  <jlaw@ventanamicro.com>
+
+	PR target/118410
+	* gcc.target/riscv/pr118410-1.c: New test.
+	* gcc.target/riscv/pr118410-2.c: Likewise.
+
+2025-04-19  Andrew Pinski  <quic_apinski@quicinc.com>
+
+	* gcc.dg/pr118947-1.c: Use 1025 as the size of the buf.
+	* gcc.dg/pr78408-3.c: Likewise.
+
+2025-04-19  Andrew Pinski  <quic_apinski@quicinc.com>
+
+	PR rtl-optimization/111949
+	* gcc.target/aarch64/bic-1.c: New test.
+
+2025-04-19  Jiaxun Yang  <jiaxun.yang@flygoat.com>
+
+	PR target/111814
+	* gcc.target/sh/pr111814.c: New test.
+
+2025-04-19  Maciej W. Rozycki  <macro@orcam.me.uk>
+
+	* gcc.target/alpha/memcpy-nested-offset-long.c: New file.
+	* gcc.target/alpha/memcpy-nested-offset-quad.c: New file.
+
+2025-04-19  Steven G. Kargl  <kargl@gcc.gnu.org>
+
+	PR fortran/119836
+	* gfortran.dg/do_concurrent_all_clauses.f90: Remove invalid
+	dg-error test.
+	* gfortran.dg/pr119836_1.f90: New test.
+	* gfortran.dg/pr119836_2.f90: New test.
+	* gfortran.dg/pr119836_3.f90: New test.
+	* gfortran.dg/pr119836_4.f90: New test.
+
 2025-04-18  Thomas Schwinge  <tschwinge@baylibre.com>
 
 	PR cobol/119818
diff --git a/gcc/testsuite/gcc.dg/rtl/i386/vector_eq-2.c b/gcc/testsuite/gcc.dg/rtl/i386/vector_eq-2.c
new file mode 100644
index 0000000..871d489
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/rtl/i386/vector_eq-2.c
@@ -0,0 +1,71 @@
+/* { dg-do compile { target { i?86-*-* x86_64-*-* } } } */
+/* { dg-additional-options "-O2 -march=x86-64-v3" } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v2di __attribute__((vector_size(16)));
+
+v4si __RTL (startwith ("vregs1")) foo1 (void)
+{
+(function "foo1"
+  (insn-chain
+    (block 2
+      (edge-from entry (flags "FALLTHRU"))
+      (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK)
+      (cnote 2 NOTE_INSN_FUNCTION_BEG)
+      (cinsn 3 (set (reg:V4SI <0>) (const_vector:V4SI [(const_int -1) (const_int -1) (const_int -1) (const_int -1)])))
+      (cinsn 4 (set (reg:V4SI <1>) (const_vector:V4SI [(const_int -1) (const_int -1) (const_int -1) (const_int -1)])))
+      (cinsn 5 (set (reg:V4SI <2>)
+		    (eq:V4SI (reg:V4SI <0>) (reg:V4SI <1>))))
+      (cinsn 6 (set (reg:V4SI <3>) (reg:V4SI <2>)))
+      (cinsn 7 (set (reg:V4SI xmm0) (reg:V4SI <3>)))
+      (edge-to exit (flags "FALLTHRU"))
+    )
+  )
+ (crtl (return_rtx (reg/i:V4SI xmm0)))
+)
+}
+
+v8si __RTL (startwith ("vregs1")) foo2 (void)
+{
+(function "foo2"
+  (insn-chain
+    (block 2
+      (edge-from entry (flags "FALLTHRU"))
+      (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK)
+      (cnote 2 NOTE_INSN_FUNCTION_BEG)
+      (cinsn 3 (set (reg:V8SI <0>) (const_vector:V8SI [(const_int -1) (const_int -1) (const_int -1) (const_int -1) (const_int -1) (const_int -1) (const_int -1) (const_int -1)])))
+      (cinsn 4 (set (reg:V8SI <1>) (const_vector:V8SI [(const_int -1) (const_int -1) (const_int -1) (const_int -1) (const_int -1) (const_int -1) (const_int -1) (const_int -1)])))
+      (cinsn 5 (set (reg:V8SI <2>)
+		    (eq:V8SI (reg:V8SI <0>) (reg:V8SI <1>))))
+      (cinsn 6 (set (reg:V8SI <3>) (reg:V8SI <2>)))
+      (cinsn 7 (set (reg:V8SI xmm0) (reg:V8SI <3>)))
+      (edge-to exit (flags "FALLTHRU"))
+    )
+  )
+ (crtl (return_rtx (reg/i:V8SI xmm0)))
+)
+}
+
+v2di __RTL (startwith ("vregs1")) foo3 (void)
+{
+(function "foo3"
+  (insn-chain
+    (block 2
+      (edge-from entry (flags "FALLTHRU"))
+      (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK)
+      (cnote 2 NOTE_INSN_FUNCTION_BEG)
+      (cinsn 3 (set (reg:V2DI <0>) (const_vector:V2DI [(const_int -1) (const_int -1)])))
+      (cinsn 4 (set (reg:V2DI <1>) (const_vector:V2DI [(const_int -1) (const_int -1)])))
+      (cinsn 5 (set (reg:V2DI <2>)
+		    (eq:V2DI (reg:V2DI <0>) (reg:V2DI <1>))))
+      (cinsn 6 (set (reg:V2DI <3>) (reg:V2DI <2>)))
+      (cinsn 7 (set (reg:V2DI xmm0) (reg:V2DI <3>)))
+      (edge-to exit (flags "FALLTHRU"))
+    )
+  )
+ (crtl (return_rtx (reg/i:V2DI xmm0)))
+)
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeq" 3 } } */
diff --git a/gcc/testsuite/gcc.dg/rtl/i386/vector_eq-3.c b/gcc/testsuite/gcc.dg/rtl/i386/vector_eq-3.c
new file mode 100644
index 0000000..276c4c2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/rtl/i386/vector_eq-3.c
@@ -0,0 +1,74 @@
+/* { dg-do compile { target { i?86-*-* x86_64-*-* } } } */
+/* { dg-additional-options "-O2 -march=x86-64-v3" } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v2di __attribute__((vector_size(16)));
+
+v4si __RTL (startwith ("vregs1")) foo1 (void)
+{
+(function "foo1"
+  (insn-chain
+    (block 2
+      (edge-from entry (flags "FALLTHRU"))
+      (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK)
+      (cnote 2 NOTE_INSN_FUNCTION_BEG)
+      (cinsn 3 (set (reg:V4SI <1>)
+		    (mem:V4SI (reg:SI di) [0 ptr S128 A128])))
+      (cinsn 4 (set (reg:V4SI <2>)
+		    (eq:V4SI (reg:V4SI <1>)
+			     (mem:V4SI (reg:SI di) [0 ptr S128 A128]))))
+      (cinsn 5 (set (reg:V4SI <3>) (reg:V4SI <2>)))
+      (cinsn 6 (set (reg:V4SI xmm0) (reg:V4SI <3>)))
+      (edge-to exit (flags "FALLTHRU"))
+    )
+  )
+ (crtl (return_rtx (reg/i:V4SI xmm0)))
+)
+}
+
+v8si __RTL (startwith ("vregs1")) foo2 (void)
+{
+(function "foo2"
+  (insn-chain
+    (block 2
+      (edge-from entry (flags "FALLTHRU"))
+      (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK)
+      (cnote 2 NOTE_INSN_FUNCTION_BEG)
+      (cinsn 3 (set (reg:V8SI <1>)
+		    (mem:V8SI (reg:SI di) [0 ptr S256 A256])))
+      (cinsn 4 (set (reg:V8SI <2>)
+		    (eq:V8SI (mem:V8SI (reg:SI di) [0 ptr S256 A256])
+			     (reg:V8SI <1>))))
+      (cinsn 5 (set (reg:V8SI <3>) (reg:V8SI <2>)))
+      (cinsn 6 (set (reg:V8SI xmm0) (reg:V8SI <3>)))
+      (edge-to exit (flags "FALLTHRU"))
+    )
+  )
+ (crtl (return_rtx (reg/i:V8SI xmm0)))
+)
+}
+
+v2di __RTL (startwith ("vregs1")) foo3 (void)
+{
+(function "foo3"
+  (insn-chain
+    (block 2
+      (edge-from entry (flags "FALLTHRU"))
+      (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK)
+      (cnote 2 NOTE_INSN_FUNCTION_BEG)
+      (cinsn 3 (set (reg:V2DI <1>)
+		    (mem:V2DI (reg:SI di) [0 ptr S128 A128])))
+      (cinsn 4 (set (reg:V2DI <2>)
+		    (eq:V2DI (reg:V2DI <1>)
+			     (mem:V2DI (reg:SI di) [0 ptr S128 A128]))))
+      (cinsn 5 (set (reg:V2DI <3>) (reg:V2DI <2>)))
+      (cinsn 6 (set (reg:V2DI xmm0) (reg:V2DI <3>)))
+      (edge-to exit (flags "FALLTHRU"))
+    )
+  )
+ (crtl (return_rtx (reg/i:V2DI xmm0)))
+)
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeq" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
new file mode 100644
index 0000000..ad9e07b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mfma -mavx2" } */
+/* { dg-final { scan-assembler-times {(?n)vfn?m(add|sub)[1-3]*ps} 2 } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+/* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a  */
+
+v4sf
+foo (v4sf a, v4sf b)
+{
+    return a / b;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/pr118410-1.c b/gcc/testsuite/gcc.target/riscv/pr118410-1.c
new file mode 100644
index 0000000..4a8b847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr118410-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+/* { dg-options "-march=rv64gcb -mabi=lp64d" { target { rv64} } } */
+/* { dg-options "-march=rv32gcb -mabi=ilp32" { target { rv32} } } */
+
+long orlow(long x) { return x | ((1L << 24) - 1); }
+
+/* { dg-final { scan-assembler-times "orn\t" 1 } } */
+/* { dg-final { scan-assembler-not "addi\t" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/pr118410-2.c b/gcc/testsuite/gcc.target/riscv/pr118410-2.c
new file mode 100644
index 0000000..b63a1d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr118410-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" } } */
+/* { dg-options "-march=rv64gcb -mabi=lp64d" { target { rv64} } } */
+/* { dg-options "-march=rv32gcb -mabi=ilp32" { target { rv32} } } */
+
+long xorlow(long x) { return x ^ ((1L << 24) - 1); }
+
+/* { dg-final { scan-assembler-times "xnor\t" 1 } } */
+/* { dg-final { scan-assembler-not "addi\t" } } */
diff --git a/gcc/testsuite/rust/compile/nr2/compile.exp b/gcc/testsuite/rust/compile/nr2/compile.exp
index 4d91dd0..9e15cdd 100644
--- a/gcc/testsuite/rust/compile/nr2/compile.exp
+++ b/gcc/testsuite/rust/compile/nr2/compile.exp
@@ -19,6 +19,15 @@
 # Load support procs.
 load_lib rust-dg.exp
 
+# These tests don't run runtest_file_p consistently if it
+# doesn't return the same values, so disable parallelization
+# of this *.exp file.  The first parallel runtest to reach
+# this will run all the tests serially.
+if ![gcc_parallel_test_run_p compile] {
+    return
+}
+gcc_parallel_test_enable 0
+
 # Initialize `dg'.
 dg-init
 
@@ -136,3 +145,5 @@ namespace eval rust-nr2-ns {
 
 # All done.
 dg-finish
+
+gcc_parallel_test_enable 1