14 files changed, 378 insertions, 13 deletions
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 31f3ee2..1ba5ac4 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3309,8 +3309,16 @@ ix86_get_vector_load_mode (unsigned int size)
     mode = V64QImode;
   else if (size == 32)
     mode = V32QImode;
-  else
+  else if (size == 16)
     mode = V16QImode;
+  else if (size == 8)
+    mode = V8QImode;
+  else if (size == 4)
+    mode = V4QImode;
+  else if (size == 2)
+    mode = V2QImode;
+  else
+    gcc_unreachable ();
   return mode;
 }
 
@@ -3338,13 +3346,36 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
       if (SUBREG_P (dest) || mode == vector_mode)
 	replace = vector_const;
       else
-	replace = gen_rtx_SUBREG (mode, vector_const, 0);
+	{
+	  unsigned int size = GET_MODE_SIZE (mode);
+	  if (size < ix86_regmode_natural_size (mode))
+	    {
+	      /* If the mode size is smaller than its natural size,
+		 first insert an extra move with a QI vector SUBREG
+		 of the same size to avoid validate_subreg failure.  */
+	      machine_mode vmode = ix86_get_vector_load_mode (size);
+	      rtx vreg;
+	      if (mode == vmode)
+		vreg = vector_const;
+	      else
+		{
+		  vreg = gen_reg_rtx (vmode);
+		  rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
+		  rtx pat = gen_rtx_SET (vreg, vsubreg);
+		  rtx_insn *vinsn = emit_insn_before (pat, insn);
+		  df_insn_rescan (vinsn);
+		}
+	      replace = gen_rtx_SUBREG (mode, vreg, 0);
+	    }
+	  else
+	    replace = gen_rtx_SUBREG (mode, vector_const, 0);
+	}
 
-      /* NB: Don't run recog_memoized here since vector SUBREG may not
-	 be valid.  Let LRA handle vector SUBREG.  */
       SET_SRC (set) = replace;
       /* Drop possible dead definitions.  */
       PATTERN (insn) = set;
+      INSN_CODE (insn) = -1;
+      recog_memoized (insn);
       df_insn_rescan (insn);
     }
 }
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f28c92a..bef95ea 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -12320,6 +12320,7 @@ get_thread_pointer (machine_mode tp_mode, bool to_reg)
 
 static GTY(()) rtx ix86_tls_index_symbol;
 
+#if TARGET_WIN32_TLS
 static rtx
 ix86_tls_index (void)
 {
@@ -12331,6 +12332,7 @@ ix86_tls_index (void)
   else
     return ix86_tls_index_symbol;
 }
+#endif
 
 /* Construct the SYMBOL_REF for the tls_get_addr function.  */
 
@@ -22792,6 +22794,27 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
       else
 	*total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
       return false;
+    case FLOAT:
+    case UNSIGNED_FLOAT:
+      if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+	/* TODO: We do not have cost tables for x87.  */
+	*total = cost->fadd;
+      else if (VECTOR_MODE_P (mode))
+	*total = ix86_vec_cost (mode, cost->cvtpi2ps);
+      else
+	*total = cost->cvtsi2ss;
+      return false;
+
+    case FIX:
+    case UNSIGNED_FIX:
+      if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+	/* TODO: We do not have cost tables for x87.  */
+	*total = cost->fadd;
+      else if (VECTOR_MODE_P (mode))
+	*total = ix86_vec_cost (mode, cost->cvtps2pi);
+      else
+	*total = cost->cvtss2si;
+      return false;
 
     case ABS:
       /* SSE requires memory load for the constant operand. It may make
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 02bf357..6a38de3 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -213,6 +213,10 @@ struct processor_costs {
 				   such as VCVTPD2PS with larger reg in ymm.  */
   const int vcvtps2pd512;	/* cost 512bit packed FP conversions,
 				   such as VCVTPD2PS with larger reg in zmm.  */
+  const int cvtsi2ss;		/* cost of CVTSI2SS instruction.  */
+  const int cvtss2si;		/* cost of CVT(T)SS2SI instruction.  */
+  const int cvtpi2ps;		/* cost of CVTPI2PS instruction.  */
+  const int cvtps2pi;		/* cost of CVT(T)PS2PI instruction.  */
   const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp;
 				/* Specify reassociation width for integer,
 				   fp, vector integer and vector fp
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index cddcf61..6cce70a 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -134,6 +134,11 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (4),			/* cost of CVTSS2SD etc.  */
   COSTS_N_BYTES (4),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_BYTES (6),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_BYTES (4),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_BYTES (4),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_BYTES (4),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_BYTES (4),			/* cost of CVT(T)PS2PI instruction.  */
+  
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   ix86_size_memcpy,
   ix86_size_memset,
@@ -249,6 +254,10 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (27),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (54),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (108),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (27),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (27),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (27),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (27),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i386_memcpy,
   i386_memset,
@@ -365,6 +374,10 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (8),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (16),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (32),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (27),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (27),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (27),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (27),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i486_memcpy,
   i486_memset,
@@ -479,6 +492,10 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
@@ -586,6 +603,10 @@ struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (5),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (10),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (20),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (5),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (5),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
@@ -708,6 +729,10 @@ struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
@@ -821,6 +846,10 @@ struct processor_costs geode_cost = {
   COSTS_N_INSNS (6),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (12),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (24),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   geode_memcpy,
   geode_memset,
@@ -937,6 +966,10 @@ struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (4),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (8),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (2),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (2),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k6_memcpy,
   k6_memset,
@@ -1054,6 +1087,10 @@ struct processor_costs athlon_cost = {
   COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (8),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (16),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (4),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   athlon_memcpy,
   athlon_memset,
@@ -1180,6 +1217,10 @@ struct processor_costs k8_cost = {
   COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (8),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (16),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (10),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k8_memcpy,
   k8_memset,
@@ -1314,6 +1355,10 @@ struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (8),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (16),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (7),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   amdfam10_memcpy,
   amdfam10_memset,
@@ -1441,6 +1486,10 @@ const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (7),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (14),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (13),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver_memcpy,
   bdver_memset,
@@ -1593,6 +1642,10 @@ struct processor_costs znver1_cost = {
   /* Real latency is 4, but for split regs multiply cost of half op by 2.  */
   COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (8),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (7),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)PS2PI instruction.  */
   /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
      and it can execute 2 integer additions and 2 multiplications thus
      reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
@@ -1755,6 +1808,10 @@ struct processor_costs znver2_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (5),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (10),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (7),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)PS2PI instruction.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -1893,6 +1950,10 @@ struct processor_costs znver3_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (5),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (10),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -2034,6 +2095,10 @@ struct processor_costs znver4_cost = {
   COSTS_N_INSNS (5),			/* cost of 256bit VCVTPS2PD etc.  */
   /* Real latency is 6, but for split regs multiply cost of half op by 2.  */
   COSTS_N_INSNS (10),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -2188,6 +2253,10 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (5),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (5),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   /* Zen5 can execute:
       - integer ops: 6 per cycle, at most 3 multiplications.
 	latency 1 for additions, 3 for multiplications (pipelined)
@@ -2330,6 +2399,10 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (2),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (4),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (7),			/* cost of CVT(T)PS2PI instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   skylake_memcpy,
   skylake_memset,
@@ -2462,6 +2535,10 @@ struct processor_costs icelake_cost = {
   COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (2),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (2),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (7),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)PS2PI instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   icelake_memcpy,
   icelake_memset,
@@ -2588,6 +2665,10 @@ struct processor_costs alderlake_cost = {
   COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (2),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (2),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (7),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)PS2PI instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   alderlake_memcpy,
   alderlake_memset,
@@ -2707,6 +2788,10 @@ const struct processor_costs btver1_cost = {
   COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (7),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (14),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (13),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver1_memcpy,
   btver1_memset,
@@ -2823,6 +2908,10 @@ const struct processor_costs btver2_cost = {
   COSTS_N_INSNS (4),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (7),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (14),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (13),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver2_memcpy,
   btver2_memset,
@@ -2938,6 +3027,10 @@ struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (10),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (20),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (40),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (20),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (17),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (12),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium4_memcpy,
   pentium4_memset,
@@ -3056,6 +3149,10 @@ struct processor_costs nocona_cost = {
   COSTS_N_INSNS (10),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (20),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (40),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (20),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (17),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (12),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   nocona_memcpy,
   nocona_memset,
@@ -3172,6 +3269,10 @@ struct processor_costs atom_cost = {
   COSTS_N_INSNS (6),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (12),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (24),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (7),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (10),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   atom_memcpy,
   atom_memset,
@@ -3288,6 +3389,10 @@ struct processor_costs slm_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (5),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   slm_memcpy,
   slm_memset,
@@ -3418,6 +3523,10 @@ struct processor_costs tremont_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (4),			/* cost of CVT(T)PS2PI instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   tremont_memcpy,
   tremont_memset,
@@ -3534,6 +3643,10 @@ struct processor_costs intel_cost = {
   COSTS_N_INSNS (8),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (16),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (32),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (8),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (8),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of CVT(T)PS2PI instruction.  */
   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   intel_memcpy,
   intel_memset,
@@ -3655,6 +3768,10 @@ struct processor_costs lujiazui_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)PS2PI instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   lujiazui_memcpy,
   lujiazui_memset,
@@ -3774,6 +3891,10 @@ struct processor_costs yongfeng_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)PS2PI instruction.  */
   4, 4, 4, 4,				/* reassoc int, fp, vec_int, vec_fp.  */
   yongfeng_memcpy,
   yongfeng_memset,
@@ -3893,6 +4014,10 @@ struct processor_costs shijidadao_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (6),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (12),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (3),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)PS2PI instruction.  */
   4, 4, 4, 4,				/* reassoc int, fp, vec_int, vec_fp.  */
   shijidadao_memcpy,
   shijidadao_memset,
@@ -4020,6 +4145,10 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (3),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (4),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (5),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of CVT(T)PS2PI instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
   generic_memcpy,
   generic_memset,
@@ -4152,6 +4281,10 @@ struct processor_costs core_cost = {
   COSTS_N_INSNS (2),			/* cost of CVTSS2SD etc.  */
   COSTS_N_INSNS (2),			/* cost of 256bit VCVTPS2PD etc.  */
   COSTS_N_INSNS (2),			/* cost of 512bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (6),			/* cost of CVTSI2SS instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVT(T)SS2SI instruction.  */
+  COSTS_N_INSNS (6),			/* cost of CVTPI2PS instruction.  */
+  COSTS_N_INSNS (7),			/* cost of CVT(T)PS2PI instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   core_memcpy,
   core_memset,
diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index 0c3b0cc..7cf7e8a 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1673,3 +1673,26 @@
     DONE;
   }
   [(set_attr "type" "vandn")])
+
+
+;; =============================================================================
+;; Combine vec_duplicate + op.vv to op.vx
+;; Include
+;; - vadd.vx
+;; =============================================================================
+(define_insn_and_split "*<optab>_vx_<mode>"
+ [(set (match_operand:V_VLSI    0 "register_operand")
+       (any_int_binop_no_shift_vx:V_VLSI
+	 (vec_duplicate:V_VLSI
+	   (match_operand:<VEL> 1 "register_operand"))
+	 (match_operand:V_VLSI  2 "<binop_rhs2_predicate>")))]
+  "TARGET_VECTOR && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+    rtx ops[] = {operands[0], operands[2], operands[1]};
+    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
+				   riscv_vector::BINARY_OP, ops);
+  }
+  [(set_attr "type" "vialu")])
diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index 20d03dc..95df533 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -1302,3 +1302,77 @@
     }
   DONE;
 })
+
+;; More forms of single bit extraction.  The RISC-V port does not
+;; define SHIFT_COUNT_TRUNCATED so we need forms where the bit position
+;; is masked.
+;;
+;; We could in theory use this for rv32 as well, but it probably does
+;; not occur in practice.  The bit position would need to be QI/HI mode,
+;; otherwise we would not need the zero extension.
+;;
+;; One could also argue that the zero extension is redundant and should
+;; have been optimized away during RTL simplification.
+(define_insn "*bextdi_position_ze_masked"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extract:DI (match_operand:DI 1 "register_operand" "r")
+			 (const_int 1)
+			 (zero_extend:DI
+			  (and:SI (match_operand:SI 2 "register_operand" "r")
+				  (const_int 63)))))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "bext\t%0,%1,%2"
+  [(set_attr "type" "bitmanip")])
+
+;; Same as above, but without the extraneous zero_extend.
+(define_insn "*bextdi_position_ze_masked"
+  [(set (match_operand:X 0 "register_operand" "=r")
+	(zero_extract:X
+	  (match_operand:X 1 "register_operand" "r")
+	  (const_int 1)
+	  (and:X (match_operand:SI 2 "register_operand" "r")
+		 (match_operand:SI 3 "bitpos_mask_operand" "n"))))]
+  "TARGET_64BIT && TARGET_ZBS"
+  "bext\t%0,%1,%2"
+  [(set_attr "type" "bitmanip")])
+
+
+;; Single bit extraction by first shifting it into the sign bit, then
+;; shifting it down to the low bit.
+(define_insn "*bext<mode>_position_masked"
+  [(set (match_operand:X 0 "register_operand" "=r")
+	(lshiftrt:X (ashift:X (match_operand:X 1 "register_operand" "r")
+			      (match_operand:QI 2 "register_operand" "r"))
+		    (match_operand:X 3 "bitpos_mask_operand" "n")))]
+  "TARGET_ZBS"
+  "bext\t%0,%1,%2"
+  [(set_attr "type" "bitmanip")])
+
+;; Single bit extraction by shifting into the low bit, but with the
+;; position formed with a subreg of a mask.
+(define_insn "*bext<mode>_position_masked_subreg"
+  [(set (match_operand:X 0 "register_operand" "=r")
+	(lshiftrt:X
+	 (ashift:X (match_operand:X 1 "register_operand" "r")
+		   (subreg:QI
+		    (and:X (match_operand:X 2 "register_operand" "r")
+			   (match_operand:X 3 "bitpos_mask_operand" "n")) 0))
+	 (match_operand:X 4 "bitpos_mask_operand" "n")))]
+  "TARGET_ZBS"
+  "bext\t%0,%1,%2"
+  [(set_attr "type" "bitmanip")])
+
+;; This has shown up in testing.  In particular we end up with an
+;; immediate input.  We can load that into a register and target
+;; one of the above bext patterns.
+(define_split
+  [(set (match_operand:X 0 "register_operand")
+	(and:X (lshiftrt:X (match_operand 1 "immediate_operand")
+			   (match_operand:QI 2 "register_operand"))
+	       (const_int 1)))
+   (clobber (match_operand:X 3 "register_operand"))]
+  ""
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 0) (zero_extract:X (match_dup 3)
+				      (const_int 1)
+				      (zero_extend:X (match_dup 2))))])
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index f26bafc..c9a638c 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -685,3 +685,7 @@
   (and (match_operand 0 "register_operand")
        (match_test "REGNO (op) == RETURN_ADDR_REGNUM
 		    || REGNO (op) == T0_REGNUM")))
+
+(define_predicate "bitpos_mask_operand"
+  (and (match_code "const_int")
+       (match_test "TARGET_64BIT ? INTVAL (op) == 63 : INTVAL (op) == 31")))
diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
index 26fe228..9766b89 100644
--- a/gcc/config/riscv/riscv-opts.h
+++ b/gcc/config/riscv/riscv-opts.h
@@ -162,4 +162,6 @@ enum riscv_tls_type {
 #define TARGET_VECTOR_AUTOVEC_SEGMENT					       \
   (TARGET_VECTOR && riscv_mautovec_segment)
 
+#define GPR2VR_COST_UNPROVIDED -1
+
 #endif /* ! GCC_RISCV_OPTS_H */
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 2e88990..b0d5bbb 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -836,6 +836,7 @@ struct riscv_tune_info {
 const struct riscv_tune_info *
 riscv_parse_tune (const char *, bool);
 const cpu_vector_cost *get_vector_costs ();
+int get_gr2vr_cost ();
 
 enum
 {
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 167375c..c28eecd 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1121,7 +1121,7 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
     {
     case scalar_to_vec:
       stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
-		    : costs->regmove->GR2VR);
+		    : get_gr2vr_cost ());
       break;
     case vec_to_scalar:
       stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index a065732..3ee88db 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -3863,7 +3863,40 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN
      Cost Model need to be well analyzed and supported in the future. */
   if (riscv_v_ext_mode_p (mode))
     {
-      *total = COSTS_N_INSNS (1);
+      int gr2vr_cost = get_gr2vr_cost ();
+
+      switch (outer_code)
+	{
+	case SET:
+	  {
+	    switch (GET_CODE (x))
+	      {
+	      case VEC_DUPLICATE:
+		*total = gr2vr_cost * COSTS_N_INSNS (1);
+		break;
+	      case PLUS:
+		{
+		  rtx op_0 = XEXP (x, 0);
+		  rtx op_1 = XEXP (x, 1);
+
+		  if (GET_CODE (op_0) == VEC_DUPLICATE
+		      || GET_CODE (op_1) == VEC_DUPLICATE)
+		    *total = (gr2vr_cost + 1) * COSTS_N_INSNS (1);
+		  else
+		    *total = COSTS_N_INSNS (1);
+		}
+		break;
+	      default:
+		*total = COSTS_N_INSNS (1);
+		break;
+	      }
+	  }
+	  break;
+	default:
+	  *total = COSTS_N_INSNS (1);
+	  break;
+	}
+
       return true;
     }
 
@@ -9690,7 +9723,7 @@ riscv_register_move_cost (machine_mode mode,
   if (to == V_REGS)
     {
       if (from_is_gpr)
-	return get_vector_costs ()->regmove->GR2VR;
+	return get_gr2vr_cost ();
       else if (from_is_fpr)
 	return get_vector_costs ()->regmove->FR2VR;
     }
@@ -12540,6 +12573,21 @@ get_vector_costs ()
   return costs;
 }
 
+/* Return the cost of operation that move from gpr to vr.
+   It will take the value of --param=gpr2vr_cost if it is provided.
+   Or the default regmove->GR2VR will be returned.  */
+
+int
+get_gr2vr_cost ()
+{
+  int cost = get_vector_costs ()->regmove->GR2VR;
+
+  if (gpr2vr_cost != GPR2VR_COST_UNPROVIDED)
+    cost = gpr2vr_cost;
+
+  return cost;
+}
+
 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
 
 static int
@@ -12606,7 +12654,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
 	{
 	  /* TODO: This is too pessimistic in case we can splat.  */
 	  int regmove_cost = fp ? costs->regmove->FR2VR
-	    : costs->regmove->GR2VR;
+	    : get_gr2vr_cost ();
 	  return (regmove_cost + common_costs->scalar_to_vec_cost)
 	    * estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
 	}
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 15c89ff..259997f 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -3173,15 +3173,25 @@
   "#"
   "&& reload_completed"
   [(set (match_dup 4) (lshiftrt:X (subreg:X (match_dup 2) 0) (match_dup 6)))
-   (set (match_dup 4) (and:X (match_dup 4) (match_dup 7)))
+   (set (match_dup 4) (match_dup 8))
    (set (pc) (if_then_else (match_op_dup 1 [(match_dup 4) (const_int 0)])
 			   (label_ref (match_dup 0)) (pc)))]
 {
-	HOST_WIDE_INT mask = INTVAL (operands[3]);
-	int trailing = ctz_hwi (mask);
+  HOST_WIDE_INT mask = INTVAL (operands[3]);
+  int trailing = ctz_hwi (mask);
+
+  operands[6] = GEN_INT (trailing);
+  operands[7] = GEN_INT (mask >> trailing);
 
-	operands[6] = GEN_INT (trailing);
-	operands[7] = GEN_INT (mask >> trailing);
+  /* This splits after reload, so there's little chance to clean things
+     up.  Rather than emit a ton of RTL here, we can just make a new
+     operand for that RHS and use it.  For the case where the AND would
+     have been redundant, we can make it a NOP move, which does get
+     cleaned up.  */
+  if (operands[7] == CONSTM1_RTX (word_mode))
+    operands[8] = operands[4];
+  else
+    operands[8] = gen_rtx_AND (word_mode, operands[4], operands[7]);
 }
 [(set_attr "type" "branch")])
 
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 7515c8e..9e471be 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -466,6 +466,10 @@ Mask(XCVBI) Var(riscv_xcv_subext)
 TargetVariable
 int riscv_sv_subext
 
+Mask(SVADE) Var(riscv_sv_subext)
+
+Mask(SVADU) Var(riscv_sv_subext)
+
 Mask(SVINVAL) Var(riscv_sv_subext)
 
 Mask(SVNAPOT) Var(riscv_sv_subext)
@@ -579,6 +583,10 @@ Inline strlen calls if possible.
 Target RejectNegative Joined UInteger Var(riscv_strcmp_inline_limit) Init(64)
 Max number of bytes to compare as part of inlined strcmp/strncmp routines (default: 64).
 
+-param=gpr2vr-cost=
+Target RejectNegative Joined UInteger Var(gpr2vr_cost) Init(GPR2VR_COST_UNPROVIDED)
+Set the cost value of the rvv instruction when operate from GPR to VR.
+
 Enum
 Name(rvv_max_lmul) Type(enum rvv_max_lmul_enum)
 The RVV possible LMUL (-mrvv-max-lmul=):
diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md
index b4c86909..eae3340 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -4041,6 +4041,10 @@
   smax umax smin umin mult div udiv mod umod
 ])
 
+(define_code_iterator any_int_binop_no_shift_vx [
+  plus
+])
+
 (define_code_iterator any_int_unop [neg not])
 
 (define_code_iterator any_commutative_binop [plus and ior xor