Diffstat (limited to 'gcc/config')
23 files changed, 1174 insertions, 405 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index a121a18..e7c459d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3223,6 +3223,7 @@ DONE; } ) + (define_insn "extend<mode><Vwide>2" [(set (match_operand:<VWIDE> 0 "register_operand" "=w") (float_extend:<VWIDE> @@ -3232,6 +3233,29 @@ [(set_attr "type" "neon_fp_cvt_widen_s")] ) +/* A BF->SF conversion is a left shift by 16; however, shifts are expensive + and the generic middle-end expansion would force the value through a DI + move. Instead use EXT to do the shift, which gives better throughput and + avoids going through the GPRs. */ + +(define_expand "extendbfsf2" + [(set (match_operand:SF 0 "register_operand" "=w") + (float_extend:SF + (match_operand:BF 1 "register_operand" "w")))] + "TARGET_SIMD" +{ + rtx tmp0 = aarch64_gen_shareable_zero (V8BFmode); + rtx op0 = force_lowpart_subreg (V8BFmode, operands[1], BFmode); + rtx res = gen_reg_rtx (V8BFmode); + emit_insn (gen_aarch64_extv8bf (res, tmp0, op0, gen_int_mode (7, SImode))); + /* Subregs between floating point modes aren't allowed to change size, so go + through V4SFmode. */ + res = force_lowpart_subreg (V4SFmode, res, V8BFmode); + res = force_lowpart_subreg (SFmode, res, V4SFmode); + emit_move_insn (operands[0], res); + DONE; +}) + + ;; Float narrowing operations. (define_insn "aarch64_float_trunc_rodd_df" diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index a34d2e3..96c183d 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -3128,6 +3128,20 @@ } [(set_attr "type" "mult")]) +(define_expand "abs<mode>2" + [(set (match_operand:V_INT 0 "register_operand") + (abs:V_INT (match_operand:V_INT 1 "register_operand")))] + "" + { + rtx vcc = gen_reg_rtx (DImode); + rtx zero = gcn_vec_constant (<MODE>mode, 0); + emit_insn (gen_vec_cmp<mode>di (vcc, gen_rtx_LT (VOIDmode, 0, 0), + operands[1], zero)); + emit_insn (gen_sub<mode>3_exec (operands[0], zero, operands[1], + operands[1], vcc)); + DONE; + }) + ;; }}} ;; {{{ FP binops - special cases diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index dadcf76..ba598a8 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -2917,7 +2917,7 @@ ix86_option_override_internal (bool main_args_p, else { opts->x_ix86_move_max = opts->x_prefer_vector_width_type; - if (opts_set->x_ix86_move_max == PVW_NONE) + if (opts->x_ix86_move_max == PVW_NONE) { if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)) opts->x_ix86_move_max = PVW_AVX512; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 8a3e336..b812d8b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4982,7 +4982,7 @@ "TARGET_64BIT" "@ {cltq|cdqe} - movs{lq|x}\t{%1, %0|%0, %1}" + movs{lq|xd}\t{%1, %0|%0, %1}" [(set_attr "type" "imovx") (set_attr "mode" "DI") (set_attr "prefix_0f" "0") @@ -27353,6 +27353,72 @@ (match_dup 0))] "peep2_reg_dead_p (2, operands[0])" [(set (match_dup 2) (match_dup 1))]) + +;; umax (a, add (a, b)) => [sum, ovf] = add (a, b); ovf ? a : sum +;; umin (a, add (a, b)) => [sum, ovf] = add (a, b); ovf ?
sum : a + +(define_code_attr ovf_add_cmp [(umax "geu") (umin "ltu")]) + +(define_int_iterator ovf_comm [1 2]) + +(define_insn_and_split "*plus_within_<code><mode>3_<ovf_comm>" + [(set (match_operand:SWI248 0 "register_operand") + (umaxmin:SWI248 + (plus:SWI248 (match_operand:SWI248 1 "nonimmediate_operand") + (match_operand:SWI248 2 "<general_operand>")) + (match_dup ovf_comm))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel + [(set (reg:CCC FLAGS_REG) + (compare:CCC + (plus:SWI248 (match_dup 1) (match_dup 2)) + (match_dup ovf_comm))) + (set (match_dup 3) + (plus:SWI248 (match_dup 1) (match_dup 2)))]) + (set (match_dup 0) + (if_then_else:SWI248 + (<ovf_add_cmp> (reg:CCC FLAGS_REG) (const_int 0)) + (match_dup 3) + (match_dup ovf_comm)))] +{ + operands[<ovf_comm>] = force_reg (<MODE>mode, operands[<ovf_comm>]); + operands[3] = gen_reg_rtx (<MODE>mode); +}) + +;; umax (a, sub (a, b)) => [diff, udf] = sub (a, b); udf ? diff : a +;; umin (a, sub (a, b)) => [diff, udf] = sub (a, b); udf ? a : diff + +(define_code_attr udf_sub_cmp [(umax "ltu") (umin "geu")]) + +(define_insn_and_split "*minus_within_<code><mode>3" + [(set (match_operand:SWI248 0 "register_operand") + (umaxmin:SWI248 + (minus:SWI248 (match_operand:SWI248 1 "nonimmediate_operand") + (match_operand:SWI248 2 "<general_operand>")) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel + [(set (reg:CC FLAGS_REG) + (compare:CC (match_dup 1) (match_dup 2))) + (set (match_dup 3) + (minus:SWI248 (match_dup 1) (match_dup 2)))]) + (set (match_dup 0) + (if_then_else:SWI248 + (<udf_sub_cmp> (reg:CC FLAGS_REG) (const_int 0)) + (match_dup 3) + (match_dup 1)))] +{ + operands[1] = force_reg (<MODE>mode, operands[1]); + operands[3] = gen_reg_rtx (<MODE>mode); +}) ;; Misc patterns (?) 
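A note on the umax/umin splitters above: they recognize the branchless overflow-check idiom at the RTL level. The addition itself sets the carry flag (modeled via CCCmode), so no separate comparison is needed and a single cmov picks between the sum and the original operand. A minimal C sketch of source that should now benefit, assuming a TARGET_CMOVE target (illustrative, not part of the patch):

    /* max (a, a + b): keeps A when the addition wraps.  GCC canonicalizes
       the ternary below to umax (a, a + b), which the new splitter turns
       into an add that sets CF plus a cmov, instead of add + cmp + cmov.  */
    unsigned long
    add_keep_a_on_overflow (unsigned long a, unsigned long b)
    {
      unsigned long sum = a + b;
      return sum > a ? sum : a;
    }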
@@ -27859,7 +27925,7 @@ { output_asm_insn ("mov{<imodesuffix>}\t{%3, %<k>1|%<k>1, %3}", operands); output_asm_insn ("mov{<imodesuffix>}\t{%<k>1, %0|%0, %<k>1}", operands); - return "movs{lq|x}\t{%2, %1|%1, %2}"; + return "movs{lq|xd}\t{%2, %1|%1, %2}"; } [(set_attr "type" "multi") (set_attr "length" "24")]) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5eba992..7d91585 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -643,6 +643,9 @@ (define_mode_iterator VI2_AVX512F [(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI]) +(define_mode_iterator VI2_AVX10_2 + [(V32HI "TARGET_AVX10_2") (V16HI "TARGET_AVX2") V8HI]) + (define_mode_iterator VI2_AVX512VNNIBW [(V32HI "TARGET_AVX512BW || TARGET_AVX512VNNI") (V16HI "TARGET_AVX2") V8HI]) @@ -32334,8 +32337,8 @@ (define_expand "usdot_prod<sseunpackmodelower><mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI2_AVX512F 1 "register_operand") - (match_operand:VI2_AVX512F 2 "register_operand") + (match_operand:VI2_AVX10_2 1 "register_operand") + (match_operand:VI2_AVX10_2 2 "register_operand") (match_operand:<sseunpackmode> 3 "register_operand")] "TARGET_AVXVNNIINT16 || TARGET_AVX10_2" { @@ -32352,8 +32355,8 @@ (define_expand "udot_prod<sseunpackmodelower><mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI2_AVX512F 1 "register_operand") - (match_operand:VI2_AVX512F 2 "register_operand") + (match_operand:VI2_AVX10_2 1 "register_operand") + (match_operand:VI2_AVX10_2 2 "register_operand") (match_operand:<sseunpackmode> 3 "register_operand")] "TARGET_AVXVNNIINT16 || TARGET_AVX10_2" { diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in b/gcc/config/loongarch/genopts/loongarch.opt.in index 39c1545..f0c089a 100644 --- a/gcc/config/loongarch/genopts/loongarch.opt.in +++ b/gcc/config/loongarch/genopts/loongarch.opt.in @@ -205,6 +205,10 @@ mmax-inline-memcpy-size= Target Joined RejectNegative UInteger Var(la_max_inline_memcpy_size) Init(1024) Save -mmax-inline-memcpy-size=SIZE Set the max size of memcpy to inline, default is 1024. +mbreak-code= +Target Joined UInteger Var(la_break_code) Init(-1) Save +-mbreak-code=CODE Use 'break CODE' for traps supposed to be unrecoverable, or an 'amswap.w' instruction leading to INE if CODE is out of range. + Enum Name(explicit_relocs) Type(int) The code model option names for -mexplicit-relocs: diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md index eed4d2b..7a91473 100644 --- a/gcc/config/loongarch/lasx.md +++ b/gcc/config/loongarch/lasx.md @@ -146,9 +146,6 @@ ;; Only integer modes equal or larger than a word. (define_mode_iterator ILASX_DW [V4DI V8SI]) -;; Only integer modes smaller than a word. -(define_mode_iterator ILASX_HB [V16HI V32QI]) - ;; Only used for immediate set shuffle elements instruction. 
(define_mode_iterator LASX_WHB_W [V8SI V16HI V32QI V8SF]) @@ -834,59 +831,6 @@ [(set_attr "type" "simd_div") (set_attr "mode" "<MODE>")]) -(define_insn "xor<mode>3" - [(set (match_operand:LASX 0 "register_operand" "=f,f,f") - (xor:LASX - (match_operand:LASX 1 "register_operand" "f,f,f") - (match_operand:LASX 2 "reg_or_vector_same_val_operand" "f,YC,Urv8")))] - "ISA_HAS_LASX" - "@ - xvxor.v\t%u0,%u1,%u2 - xvbitrevi.%v0\t%u0,%u1,%V2 - xvxori.b\t%u0,%u1,%B2" - [(set_attr "type" "simd_logic,simd_bit,simd_logic") - (set_attr "mode" "<MODE>")]) - -(define_insn "ior<mode>3" - [(set (match_operand:LASX 0 "register_operand" "=f,f,f") - (ior:LASX - (match_operand:LASX 1 "register_operand" "f,f,f") - (match_operand:LASX 2 "reg_or_vector_same_val_operand" "f,YC,Urv8")))] - "ISA_HAS_LASX" - "@ - xvor.v\t%u0,%u1,%u2 - xvbitseti.%v0\t%u0,%u1,%V2 - xvori.b\t%u0,%u1,%B2" - [(set_attr "type" "simd_logic,simd_bit,simd_logic") - (set_attr "mode" "<MODE>")]) - -(define_insn "and<mode>3" - [(set (match_operand:LASX 0 "register_operand" "=f,f,f") - (and:LASX - (match_operand:LASX 1 "register_operand" "f,f,f") - (match_operand:LASX 2 "reg_or_vector_same_val_operand" "f,YZ,Urv8")))] - "ISA_HAS_LASX" -{ - switch (which_alternative) - { - case 0: - return "xvand.v\t%u0,%u1,%u2"; - case 1: - { - rtx elt0 = CONST_VECTOR_ELT (operands[2], 0); - unsigned HOST_WIDE_INT val = ~UINTVAL (elt0); - operands[2] = loongarch_gen_const_int_vector (<MODE>mode, val & (-val)); - return "xvbitclri.%v0\t%u0,%u1,%V2"; - } - case 2: - return "xvandi.b\t%u0,%u1,%B2"; - default: - gcc_unreachable (); - } -} - [(set_attr "type" "simd_logic,simd_bit,simd_logic") - (set_attr "mode" "<MODE>")]) - (define_insn "one_cmpl<mode>2" [(set (match_operand:ILASX 0 "register_operand" "=f") (not:ILASX (match_operand:ILASX 1 "register_operand" "f")))] @@ -1035,16 +979,6 @@ [(set_attr "type" "simd_fmadd") (set_attr "mode" "<MODE>")]) -(define_insn "fnma<mode>4" - [(set (match_operand:FLASX 0 "register_operand" "=f") - (fma:FLASX (neg:FLASX (match_operand:FLASX 1 "register_operand" "f")) - (match_operand:FLASX 2 "register_operand" "f") - (match_operand:FLASX 3 "register_operand" "0")))] - "ISA_HAS_LASX" - "xvfnmsub.<flasxfmt>\t%u0,%u1,%u2,%u0" - [(set_attr "type" "simd_fmadd") - (set_attr "mode" "<MODE>")]) - (define_expand "sqrt<mode>2" [(set (match_operand:FLASX 0 "register_operand") (sqrt:FLASX (match_operand:FLASX 1 "register_operand")))] @@ -3633,69 +3567,38 @@ [(set_attr "type" "simd_store") (set_attr "mode" "DI")]) -(define_expand "vec_widen_<su>add_hi_<mode>" +(define_expand "vec_widen_<su><optab>_<hi_lo>_<mode>" [(match_operand:<VDMODE256> 0 "register_operand") - (any_extend:<VDMODE256> (match_operand:ILASX_HB 1 "register_operand")) - (any_extend:<VDMODE256> (match_operand:ILASX_HB 2 "register_operand"))] - "ISA_HAS_LASX" -{ - loongarch_expand_vec_widen_hilo (operands[0], operands[1], operands[2], - <u_bool>, true, "add"); - DONE; -}) - -(define_expand "vec_widen_<su>add_lo_<mode>" - [(match_operand:<VDMODE256> 0 "register_operand") - (any_extend:<VDMODE256> (match_operand:ILASX_HB 1 "register_operand")) - (any_extend:<VDMODE256> (match_operand:ILASX_HB 2 "register_operand"))] - "ISA_HAS_LASX" -{ - loongarch_expand_vec_widen_hilo (operands[0], operands[1], operands[2], - <u_bool>, false, "add"); - DONE; -}) - -(define_expand "vec_widen_<su>sub_hi_<mode>" - [(match_operand:<VDMODE256> 0 "register_operand") - (any_extend:<VDMODE256> (match_operand:ILASX_HB 1 "register_operand")) - (any_extend:<VDMODE256> (match_operand:ILASX_HB 2 
"register_operand"))] - "ISA_HAS_LASX" -{ - loongarch_expand_vec_widen_hilo (operands[0], operands[1], operands[2], - <u_bool>, true, "sub"); - DONE; -}) - -(define_expand "vec_widen_<su>sub_lo_<mode>" - [(match_operand:<VDMODE256> 0 "register_operand") - (any_extend:<VDMODE256> (match_operand:ILASX_HB 1 "register_operand")) - (any_extend:<VDMODE256> (match_operand:ILASX_HB 2 "register_operand"))] - "ISA_HAS_LASX" -{ - loongarch_expand_vec_widen_hilo (operands[0], operands[1], operands[2], - <u_bool>, false, "sub"); - DONE; -}) - -(define_expand "vec_widen_<su>mult_hi_<mode>" - [(match_operand:<VDMODE256> 0 "register_operand") - (any_extend:<VDMODE256> (match_operand:ILASX_HB 1 "register_operand")) - (any_extend:<VDMODE256> (match_operand:ILASX_HB 2 "register_operand"))] + (match_operand:ILASX_WHB 1 "register_operand") + (match_operand:ILASX_WHB 2 "register_operand") + (any_extend (const_int 0)) + (addsub (const_int 0) (const_int 0)) + (const_int zero_one)] "ISA_HAS_LASX" { + rtx (*fn_even) (rtx, rtx, rtx) = +gen_lasx_xv<optab>wev_<dlasxfmt>_<lasxfmt><u>; + rtx (*fn_odd) (rtx, rtx, rtx) = +gen_lasx_xv<optab>wod_<dlasxfmt>_<lasxfmt><u>; loongarch_expand_vec_widen_hilo (operands[0], operands[1], operands[2], - <u_bool>, true, "mult"); + <zero_one>, fn_even, fn_odd); DONE; }) -(define_expand "vec_widen_<su>mult_lo_<mode>" +(define_expand "vec_widen_<su>mult_<hi_lo>_<mode>" [(match_operand:<VDMODE256> 0 "register_operand") - (any_extend:<VDMODE256> (match_operand:ILASX_HB 1 "register_operand")) - (any_extend:<VDMODE256> (match_operand:ILASX_HB 2 "register_operand"))] + (match_operand:ILASX_WHB 1 "register_operand") + (match_operand:ILASX_WHB 2 "register_operand") + (any_extend (const_int 0)) + (const_int zero_one)] "ISA_HAS_LASX" { + rtx (*fn_even) (rtx, rtx, rtx) = +gen_lasx_xvmulwev_<dlasxfmt>_<lasxfmt><u>; + rtx (*fn_odd) (rtx, rtx, rtx) = +gen_lasx_xvmulwod_<dlasxfmt>_<lasxfmt><u>; loongarch_expand_vec_widen_hilo (operands[0], operands[1], operands[2], - <u_bool>, false, "mult"); + <zero_one>, fn_even, fn_odd); DONE; }) diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h index 6ecbe27..bec4368 100644 --- a/gcc/config/loongarch/loongarch-protos.h +++ b/gcc/config/loongarch/loongarch-protos.h @@ -198,7 +198,8 @@ extern void loongarch_register_frame_header_opt (void); extern void loongarch_expand_vec_cond_expr (machine_mode, machine_mode, rtx *); extern void loongarch_expand_vec_cond_mask_expr (machine_mode, machine_mode, rtx *); -extern void loongarch_expand_vec_widen_hilo (rtx, rtx, rtx, bool, bool, const char *); +extern void loongarch_expand_vec_widen_hilo (rtx, rtx, rtx, bool, + rtx (*)(rtx, rtx, rtx), rtx (*)(rtx, rtx, rtx)); /* Routines implemented in loongarch-c.c. 
*/ void loongarch_cpu_cpp_builtins (cpp_reader *); @@ -217,7 +218,8 @@ extern void loongarch_emit_swdivsf (rtx, rtx, rtx, machine_mode); extern bool loongarch_explicit_relocs_p (enum loongarch_symbol_type); extern bool loongarch_symbol_extreme_p (enum loongarch_symbol_type); extern bool loongarch_option_valid_attribute_p (tree, tree, tree, int); -extern void loongarch_option_override_internal (struct loongarch_target *, struct gcc_options *, struct gcc_options *); +extern void loongarch_option_override_internal (struct loongarch_target *, + struct gcc_options *, struct gcc_options *); extern void loongarch_reset_previous_fndecl (void); extern void loongarch_save_restore_target_globals (tree new_tree); extern void loongarch_register_pragmas (void); diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index c782cac..f7ce3aa 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -1718,14 +1718,36 @@ loongarch_symbol_binds_local_p (const_rtx x) bool loongarch_const_vector_bitimm_set_p (rtx op, machine_mode mode) { - if (GET_CODE (op) == CONST_VECTOR && op != CONST0_RTX (mode)) + if (GET_CODE (op) == CONST_VECTOR + && (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)) { - unsigned HOST_WIDE_INT val = UINTVAL (CONST_VECTOR_ELT (op, 0)); + unsigned HOST_WIDE_INT val; + + if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) + { + rtx val_s = CONST_VECTOR_ELT (op, 0); + const REAL_VALUE_TYPE *x = CONST_DOUBLE_REAL_VALUE (val_s); + if (GET_MODE (val_s) == DFmode) + { + long tmp[2]; + REAL_VALUE_TO_TARGET_DOUBLE (*x, tmp); + val = (unsigned HOST_WIDE_INT) tmp[1] << 32 | tmp[0]; + } + else + { + long tmp; + REAL_VALUE_TO_TARGET_SINGLE (*x, tmp); + val = (unsigned HOST_WIDE_INT) tmp; + } + } + else + val = UINTVAL (CONST_VECTOR_ELT (op, 0)); + int vlog2 = exact_log2 (val & GET_MODE_MASK (GET_MODE_INNER (mode))); if (vlog2 != -1) { - gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); gcc_assert (vlog2 >= 0 && vlog2 <= GET_MODE_UNIT_BITSIZE (mode) - 1); return loongarch_const_vector_same_val_p (op, mode); } @@ -1740,14 +1762,35 @@ loongarch_const_vector_bitimm_set_p (rtx op, machine_mode mode) bool loongarch_const_vector_bitimm_clr_p (rtx op, machine_mode mode) { - if (GET_CODE (op) == CONST_VECTOR && op != CONSTM1_RTX (mode)) + if (GET_CODE (op) == CONST_VECTOR + && (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)) { - unsigned HOST_WIDE_INT val = ~UINTVAL (CONST_VECTOR_ELT (op, 0)); + unsigned HOST_WIDE_INT val; + if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) + { + rtx val_s = CONST_VECTOR_ELT (op, 0); + const REAL_VALUE_TYPE *x = CONST_DOUBLE_REAL_VALUE (val_s); + if (GET_MODE (val_s) == DFmode) + { + long tmp[2]; + REAL_VALUE_TO_TARGET_DOUBLE (*x, tmp); + val = ~((unsigned HOST_WIDE_INT) tmp[1] << 32 | tmp[0]); + } + else + { + long tmp; + REAL_VALUE_TO_TARGET_SINGLE (*x, tmp); + val = ~((unsigned HOST_WIDE_INT) tmp); + } + } + else + val = ~UINTVAL (CONST_VECTOR_ELT (op, 0)); + int vlog2 = exact_log2 (val & GET_MODE_MASK (GET_MODE_INNER (mode))); if (vlog2 != -1) { - gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); gcc_assert (vlog2 >= 0 && vlog2 <= GET_MODE_UNIT_BITSIZE (mode) - 1); return loongarch_const_vector_same_val_p (op, mode); } @@ -4056,6 +4099,17 @@ loongarch_rtx_costs (rtx x, machine_mode mode, int outer_code, *total = loongarch_cost->int_mult_di; else *total = loongarch_cost->int_mult_si; + + /* Check for mul_widen. 
*/ + if ((GET_CODE (XEXP (x, 0)) == SIGN_EXTEND + && GET_CODE (XEXP (x, 1)) == SIGN_EXTEND) + || (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND + && GET_CODE (XEXP (x, 1)) == ZERO_EXTEND)) + { + *total += (set_src_cost (XEXP (XEXP (x, 0), 0), mode, speed) + + set_src_cost (XEXP (XEXP (x, 1), 0), mode, speed)); + return true; + } return false; case DIV: @@ -5479,12 +5533,32 @@ loongarch_expand_conditional_move (rtx *operands) } } + auto is_binary_op_0_keep_orig = [](enum rtx_code code) + { + switch (code) + { + case PLUS: + case MINUS: + case IOR: + case XOR: + case ROTATE: + case ROTATERT: + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + return true; + default: + return false; + } + }; + /* Check if the optimization conditions are met. */ if (value_if_true_insn && value_if_false_insn - /* Make sure that value_if_false and var are the same. */ - && BINARY_P (value_if_true_insn_src = SET_SRC (single_set (value_if_true_insn))) + /* Make sure that operand 0 of the operation keeps the original value. */ + && (value_if_true_insn_src = SET_SRC (single_set (value_if_true_insn))) + && is_binary_op_0_keep_orig (GET_CODE (value_if_true_insn_src)) /* Make sure that both value_if_true and value_if_false has the same var. */ && rtx_equal_p (XEXP (value_if_true_insn_src, 0), @@ -6439,7 +6513,28 @@ loongarch_print_operand (FILE *file, rtx op, int letter) if (CONST_VECTOR_P (op)) { machine_mode mode = GET_MODE_INNER (GET_MODE (op)); - unsigned HOST_WIDE_INT val = UINTVAL (CONST_VECTOR_ELT (op, 0)); + rtx val_s = CONST_VECTOR_ELT (op, 0); + unsigned HOST_WIDE_INT val; + + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + const REAL_VALUE_TYPE *x = CONST_DOUBLE_REAL_VALUE (val_s); + if (GET_MODE (val_s) == DFmode) + { + long tmp[2]; + REAL_VALUE_TO_TARGET_DOUBLE (*x, tmp); + val = (unsigned HOST_WIDE_INT) tmp[1] << 32 | tmp[0]; + } + else + { + long tmp; + REAL_VALUE_TO_TARGET_SINGLE (*x, tmp); + val = (unsigned HOST_WIDE_INT) tmp; + } + } + else + val = UINTVAL (val_s); + int vlog2 = exact_log2 (val & GET_MODE_MASK (mode)); if (vlog2 != -1) fprintf (file, "%d", vlog2); @@ -8808,105 +8903,22 @@ loongarch_expand_vec_interleave (rtx target, rtx op0, rtx op1, bool high_p) void loongarch_expand_vec_widen_hilo (rtx dest, rtx op1, rtx op2, - bool uns_p, bool high_p, const char *optab) + bool high_p, rtx (*fn_even) (rtx, rtx, rtx), + rtx (*fn_odd) (rtx, rtx, rtx)) { machine_mode wmode = GET_MODE (dest); machine_mode mode = GET_MODE (op1); - rtx t1, t2, t3; - - t1 = gen_reg_rtx (wmode); - t2 = gen_reg_rtx (wmode); - t3 = gen_reg_rtx (wmode); - switch (mode) - { - case V16HImode: - if (!strcmp (optab, "add")) - { - if (!uns_p) - { - emit_insn (gen_lasx_xvaddwev_w_h (t1, op1, op2)); - emit_insn (gen_lasx_xvaddwod_w_h (t2, op1, op2)); - } - else - { - emit_insn (gen_lasx_xvaddwev_w_hu (t1, op1, op2)); - emit_insn (gen_lasx_xvaddwod_w_hu (t2, op1, op2)); - } - } - else if (!strcmp (optab, "mult")) - { - if (!uns_p) - { - emit_insn (gen_lasx_xvmulwev_w_h (t1, op1, op2)); - emit_insn (gen_lasx_xvmulwod_w_h (t2, op1, op2)); - } - else - { - emit_insn (gen_lasx_xvmulwev_w_hu (t1, op1, op2)); - emit_insn (gen_lasx_xvmulwod_w_hu (t2, op1, op2)); - } - } - else if (!strcmp (optab, "sub")) - { - if (!uns_p) - { - emit_insn (gen_lasx_xvsubwev_w_h (t1, op1, op2)); - emit_insn (gen_lasx_xvsubwod_w_h (t2, op1, op2)); - } - else - { - emit_insn (gen_lasx_xvsubwev_w_hu (t1, op1, op2)); - emit_insn (gen_lasx_xvsubwod_w_hu (t2, op1, op2)); - } - } - break; - case V32QImode: - if (!strcmp (optab, "add")) - { - if (!uns_p) - { - emit_insn (gen_lasx_xvaddwev_h_b
(t1, op1, op2)); - emit_insn (gen_lasx_xvaddwod_h_b (t2, op1, op2)); - } - else - { - emit_insn (gen_lasx_xvaddwev_h_bu (t1, op1, op2)); - emit_insn (gen_lasx_xvaddwod_h_bu (t2, op1, op2)); - } - } - else if (!strcmp (optab, "mult")) - { - if (!uns_p) - { - emit_insn (gen_lasx_xvmulwev_h_b (t1, op1, op2)); - emit_insn (gen_lasx_xvmulwod_h_b (t2, op1, op2)); - } - else - { - emit_insn (gen_lasx_xvmulwev_h_bu (t1, op1, op2)); - emit_insn (gen_lasx_xvmulwod_h_bu (t2, op1, op2)); - } - } - else if (!strcmp (optab, "sub")) - { - if (!uns_p) - { - emit_insn (gen_lasx_xvsubwev_h_b (t1, op1, op2)); - emit_insn (gen_lasx_xvsubwod_h_b (t2, op1, op2)); - } - else - { - emit_insn (gen_lasx_xvsubwev_h_bu (t1, op1, op2)); - emit_insn (gen_lasx_xvsubwod_h_bu (t2, op1, op2)); - } - } - break; + gcc_assert (ISA_HAS_LASX + && GET_MODE_SIZE (mode) == 32 + && mode != V4DImode); - default: - gcc_unreachable (); - } + rtx t1 = gen_reg_rtx (wmode); + rtx t2 = gen_reg_rtx (wmode); + rtx t3 = gen_reg_rtx (wmode); + emit_insn (fn_even (t1, op1, op2)); + emit_insn (fn_odd (t2, op1, op2)); loongarch_expand_vec_interleave (t3, t1, t2, high_p); emit_move_insn (dest, gen_lowpart (wmode, t3)); } diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md index a275a2d..625f30c 100644 --- a/gcc/config/loongarch/loongarch.md +++ b/gcc/config/loongarch/loongarch.md @@ -679,14 +679,22 @@ ;; .................... ;; -(define_insn "trap" - [(trap_if (const_int 1) (const_int 0))] +(define_insn "*trap" + [(trap_if (const_int 1) (match_operand 0 "const_int_operand"))] "" { - return "break\t0"; + return (const_uimm15_operand (operands[0], VOIDmode) + ? "break\t%0" + : "amswap.w\t$r0,$r1,$r0"); } [(set_attr "type" "trap")]) +(define_expand "trap" + [(trap_if (const_int 1) (match_dup 0))] + "" +{ + operands[0] = GEN_INT (la_break_code); +}) ;; @@ -2523,6 +2531,38 @@ [(set_attr "type" "condmove") (set_attr "mode" "<GPR:MODE>")]) +(define_insn_and_split "both_non_zero" + [(set (match_operand:DI 0 "register_operand" "=r") + (and:DI (ne:DI (match_operand:DI 1 "register_operand" "r") + (const_int 0)) + (ne:DI (match_operand:DI 2 "register_operand" "r") + (const_int 0))))] + "TARGET_64BIT" + "#" + "&& true" + [(set (match_dup 0) + (ne:DI (match_dup 1) (const_int 0))) + (set (match_dup 0) + (if_then_else:DI (ne:DI (match_dup 2) (const_int 0)) + (match_dup 0) + (const_int 0)))]) + +(define_insn_and_split "both_non_zero_subreg" + [(set (match_operand:DI 0 "register_operand" "=r") + (and:DI (subreg:DI (ne:SI (match_operand:DI 1 "register_operand" "r") + (const_int 0)) 0) + (subreg:DI (ne:SI (match_operand:DI 2 "register_operand" "r") + (const_int 0)) 0)))] + "TARGET_64BIT" + "#" + "&& true" + [(set (match_dup 0) + (ne:DI (match_dup 1) (const_int 0))) + (set (match_dup 0) + (if_then_else:DI (ne:DI (match_dup 2) (const_int 0)) + (match_dup 0) + (const_int 0)))]) + ;; fsel copies the 3rd argument when the 1st is non-zero and the 2nd ;; argument if the 1st is zero. This means operand 2 and 3 are ;; inverted in the instruction. 
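The both_non_zero splitters just above convert the AND of two zero-tests into one set plus one conditional move. As a rough sketch (illustrative, not from the patch), on a 64-bit LoongArch target a function like the following can then be emitted as one sltu followed by a maskeqz-style conditional move, instead of two sltu instructions and an and:

    /* (a != 0) && (b != 0): one zero-test plus one conditional move,
       roughly sltu $t0,$zero,$a0 then maskeqz $a0,$t0,$a1 (exact
       instruction choice is up to the register allocator).  */
    int
    both_nonzero (long a, long b)
    {
      return a != 0 && b != 0;
    }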
@@ -3041,6 +3081,16 @@ [(set_attr "type" "shift") (set_attr "mode" "SI")]) +(define_insn "sign_extend_ashift<GPR:mode><SHORT:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (ashift:GPR + (sign_extend:GPR (match_operand:SHORT 1 "register_operand" "r")) + (match_operand:SI 2 "const_uimm5_operand")))] + "(GET_MODE_BITSIZE (<SHORT:MODE>mode) + INTVAL (operands[2])) == 32" + "slli.w\t%0,%1,%2" + [(set_attr "type" "shift") + (set_attr "mode" "<GPR:MODE>")]) + (define_insn "*rotr<mode>3" [(set (match_operand:GPR 0 "register_operand" "=r,r") (rotatert:GPR (match_operand:GPR 1 "register_operand" "r,r") diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt index fbe61c0..628eabe 100644 --- a/gcc/config/loongarch/loongarch.opt +++ b/gcc/config/loongarch/loongarch.opt @@ -213,6 +213,10 @@ mmax-inline-memcpy-size= Target Joined RejectNegative UInteger Var(la_max_inline_memcpy_size) Init(1024) Save -mmax-inline-memcpy-size=SIZE Set the max size of memcpy to inline, default is 1024. +mbreak-code= +Target Joined UInteger Var(la_break_code) Init(-1) Save +-mbreak-code=CODE Use 'break CODE' for traps supposed to be unrecoverable, or an 'amswap.w' instruction leading to INE if CODE is out of range. + Enum Name(explicit_relocs) Type(int) The code model option names for -mexplicit-relocs: diff --git a/gcc/config/loongarch/loongarch.opt.urls b/gcc/config/loongarch/loongarch.opt.urls index 606a211..c93f046 100644 --- a/gcc/config/loongarch/loongarch.opt.urls +++ b/gcc/config/loongarch/loongarch.opt.urls @@ -48,6 +48,9 @@ UrlSuffix(gcc/LoongArch-Options.html#index-mstrict-align-1) mmax-inline-memcpy-size= UrlSuffix(gcc/LoongArch-Options.html#index-mmax-inline-memcpy-size) +mbreak-code= +UrlSuffix(gcc/LoongArch-Options.html#index-mbreak-code) + mexplicit-relocs= UrlSuffix(gcc/LoongArch-Options.html#index-mexplicit-relocs-1) diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md index fb0236b..cd87757 100644 --- a/gcc/config/loongarch/lsx.md +++ b/gcc/config/loongarch/lsx.md @@ -145,9 +145,6 @@ ;; Only integer modes equal or larger than a word. (define_mode_iterator ILSX_DW [V2DI V4SI]) -;; Only integer modes smaller than a word. -(define_mode_iterator ILSX_HB [V8HI V16QI]) - ;;;; Only integer modes for fixed-point madd_q/maddr_q. 
;;(define_mode_iterator ILSX_WH [V4SI V8HI]) @@ -654,59 +651,6 @@ [(set_attr "type" "simd_div") (set_attr "mode" "<MODE>")]) -(define_insn "xor<mode>3" - [(set (match_operand:LSX 0 "register_operand" "=f,f,f") - (xor:LSX - (match_operand:LSX 1 "register_operand" "f,f,f") - (match_operand:LSX 2 "reg_or_vector_same_val_operand" "f,YC,Urv8")))] - "ISA_HAS_LSX" - "@ - vxor.v\t%w0,%w1,%w2 - vbitrevi.%v0\t%w0,%w1,%V2 - vxori.b\t%w0,%w1,%B2" - [(set_attr "type" "simd_logic,simd_bit,simd_logic") - (set_attr "mode" "<MODE>")]) - -(define_insn "ior<mode>3" - [(set (match_operand:LSX 0 "register_operand" "=f,f,f") - (ior:LSX - (match_operand:LSX 1 "register_operand" "f,f,f") - (match_operand:LSX 2 "reg_or_vector_same_val_operand" "f,YC,Urv8")))] - "ISA_HAS_LSX" - "@ - vor.v\t%w0,%w1,%w2 - vbitseti.%v0\t%w0,%w1,%V2 - vori.b\t%w0,%w1,%B2" - [(set_attr "type" "simd_logic,simd_bit,simd_logic") - (set_attr "mode" "<MODE>")]) - -(define_insn "and<mode>3" - [(set (match_operand:LSX 0 "register_operand" "=f,f,f") - (and:LSX - (match_operand:LSX 1 "register_operand" "f,f,f") - (match_operand:LSX 2 "reg_or_vector_same_val_operand" "f,YZ,Urv8")))] - "ISA_HAS_LSX" -{ - switch (which_alternative) - { - case 0: - return "vand.v\t%w0,%w1,%w2"; - case 1: - { - rtx elt0 = CONST_VECTOR_ELT (operands[2], 0); - unsigned HOST_WIDE_INT val = ~UINTVAL (elt0); - operands[2] = loongarch_gen_const_int_vector (<MODE>mode, val & (-val)); - return "vbitclri.%v0\t%w0,%w1,%V2"; - } - case 2: - return "vandi.b\t%w0,%w1,%B2"; - default: - gcc_unreachable (); - } -} - [(set_attr "type" "simd_logic,simd_bit,simd_logic") - (set_attr "mode" "<MODE>")]) - (define_insn "one_cmpl<mode>2" [(set (match_operand:ILSX 0 "register_operand" "=f") (not:ILSX (match_operand:ILSX 1 "register_operand" "f")))] @@ -852,16 +796,6 @@ [(set_attr "type" "simd_fmadd") (set_attr "mode" "<MODE>")]) -(define_insn "fnma<mode>4" - [(set (match_operand:FLSX 0 "register_operand" "=f") - (fma:FLSX (neg:FLSX (match_operand:FLSX 1 "register_operand" "f")) - (match_operand:FLSX 2 "register_operand" "f") - (match_operand:FLSX 3 "register_operand" "0")))] - "ISA_HAS_LSX" - "vfnmsub.<flsxfmt>\t%w0,%w1,%w2,%w0" - [(set_attr "type" "simd_fmadd") - (set_attr "mode" "<MODE>")]) - (define_expand "sqrt<mode>2" [(set (match_operand:FLSX 0 "register_operand") (sqrt:FLSX (match_operand:FLSX 1 "register_operand")))] @@ -3220,3 +3154,48 @@ [(set (match_dup 0) (vec_duplicate:V2DI (match_dup 1)))] "") + +(define_expand "vec_widen_<su><optab>_<hi_lo>_<mode>" + [(match_operand:<VDMODE> 0 "register_operand") + (match_operand:ILSX_WHB 1 "register_operand") + (match_operand:ILSX_WHB 2 "register_operand") + (any_extend (const_int 0)) + (addsub (const_int 0) (const_int 0)) + (const_int zero_one)] + "ISA_HAS_LSX" +{ + rtx t_even = gen_reg_rtx (<VDMODE>mode); + rtx t_odd = gen_reg_rtx (<VDMODE>mode); + emit_insn (gen_lsx_v<optab>wev_<dlsxfmt>_<lsxfmt><u> (t_even, operands[1], + operands[2])); + emit_insn (gen_lsx_v<optab>wod_<dlsxfmt>_<lsxfmt><u> (t_odd, operands[1], + operands[2])); + if (<zero_one>) + emit_insn (gen_lsx_vilvh_<dlsxfmt> (operands[0], t_even, t_odd)); + else + emit_insn (gen_lsx_vilvl_<dlsxfmt> (operands[0], t_even, t_odd)); + + DONE; +}) + +(define_expand "vec_widen_<su>mult_<hi_lo>_<mode>" + [(match_operand:<VDMODE> 0 "register_operand") + (match_operand:ILSX_WHB 1 "register_operand") + (match_operand:ILSX_WHB 2 "register_operand") + (any_extend (const_int 0)) + (const_int zero_one)] + "ISA_HAS_LSX" +{ + rtx t_even = gen_reg_rtx (<VDMODE>mode); + rtx t_odd = gen_reg_rtx 
(<VDMODE>mode); + emit_insn (gen_lsx_vmulwev_<dlsxfmt>_<lsxfmt><u> (t_even, operands[1], + operands[2])); + emit_insn (gen_lsx_vmulwod_<dlsxfmt>_<lsxfmt><u> (t_odd, operands[1], + operands[2])); + if (<zero_one>) + emit_insn (gen_lsx_vilvh_<dlsxfmt> (operands[0], t_even, t_odd)); + else + emit_insn (gen_lsx_vilvl_<dlsxfmt> (operands[0], t_even, t_odd)); + + DONE; +}) diff --git a/gcc/config/loongarch/simd.md b/gcc/config/loongarch/simd.md index 4156b26..b73f65a 100644 --- a/gcc/config/loongarch/simd.md +++ b/gcc/config/loongarch/simd.md @@ -23,6 +23,10 @@ ;; Integer modes supported by LASX. (define_mode_iterator ILASX [V4DI V8SI V16HI V32QI]) +;; Only integer modes smaller than a word. +(define_mode_iterator ILSX_HB [V8HI V16QI]) +(define_mode_iterator ILASX_HB [V16HI V32QI]) + ;; FP modes supported by LSX (define_mode_iterator FLSX [V2DF V4SF]) @@ -38,6 +42,10 @@ ;; All integer modes available (define_mode_iterator IVEC [(ILSX "ISA_HAS_LSX") (ILASX "ISA_HAS_LASX")]) +;; All integer modes smaller than a word. +(define_mode_iterator IVEC_HB [(ILSX_HB "ISA_HAS_LSX") + (ILASX_HB "ISA_HAS_LASX")]) + ;; All FP modes available (define_mode_iterator FVEC [(FLSX "ISA_HAS_LSX") (FLASX "ISA_HAS_LASX")]) @@ -90,12 +98,18 @@ (V8HI "V4SI") (V16HI "V8SI") (V16QI "V8HI") (V32QI "V16HI")]) +(define_mode_attr WVEC_QUARTER [(V8HI "V2DI") (V16HI "V4DI") + (V16QI "V4SI") (V32QI "V8SI")]) + ;; Lower-case version. (define_mode_attr wvec_half [(V2DI "v1ti") (V4DI "v2ti") (V4SI "v2di") (V8SI "v4di") (V8HI "v4si") (V16HI "v8si") (V16QI "v8hi") (V32QI "v16hi")]) +(define_mode_attr wvec_quarter [(V8HI "v2di") (V16HI "v4di") + (V16QI "v4si") (V32QI "v8si")]) + ;; Integer vector modes with the same length and unit size as a mode. (define_mode_attr VIMODE [(V2DI "V2DI") (V4SI "V4SI") (V8HI "V8HI") (V16QI "V16QI") @@ -124,12 +138,16 @@ (V8HI "h") (V16HI "h") (V16QI "b") (V32QI "b")]) -;; Suffix for widening LSX or LASX instructions. +;; Suffix for double widening LSX or LASX instructions. (define_mode_attr simdfmt_w [(V2DI "q") (V4DI "q") (V4SI "d") (V8SI "d") (V8HI "w") (V16HI "w") (V16QI "h") (V32QI "h")]) +;; Suffix for quadruple widening LSX or LASX instructions. +(define_mode_attr simdfmt_qw [(V8HI "d") (V16HI "d") + (V16QI "w") (V32QI "w")]) + ;; Suffix for integer mode in LSX or LASX instructions with FP input but ;; integer output. (define_mode_attr simdifmt_for_f [(V2DF "l") (V4DF "l") @@ -169,6 +187,8 @@ (V4SI "uimm5") (V8SI "uimm5") (V2DI "uimm6") (V4DI "uimm6")]) +(define_int_attr hi_lo [(0 "lo") (1 "hi")]) + ;; ======================================================================= ;; For many LASX instructions, the only difference of it from the LSX ;; counterpart is the length of vector operands. Describe these LSX/LASX @@ -431,6 +451,17 @@ [(set_attr "type" "simd_int_arith") (set_attr "mode" "<MODE>")]) +;; <x>vfnmsub.{s/d} +(define_insn "fnma<mode>4" + [(set (match_operand:FVEC 0 "register_operand" "=f") + (fma:FVEC (neg:FVEC (match_operand:FVEC 1 "register_operand" "f")) + (match_operand:FVEC 2 "register_operand" "f") + (match_operand:FVEC 3 "register_operand" "f")))] + "!HONOR_SIGNED_ZEROS (<MODE>mode)" + "<x>vfnmsub.<simdfmt>\t%<wu>0,%<wu>1,%<wu>2,%<wu>3" + [(set_attr "type" "simd_fmadd") + (set_attr "mode" "<MODE>")]) + ;; <x>vfcmp.*.{s/d} with defined RTX code ;; There are no fcmp.{sugt/suge/cgt/cge}.{s/d} menmonics in GAS, so we have ;; to reverse the operands ourselves :(. 
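A note on the fnma<mode>4 pattern moved into simd.md above: it is gated on !HONOR_SIGNED_ZEROS because <x>vfnmsub computes -(a*b - c), which matches fnma's c - a*b everywhere except in the sign of an exactly-zero result (when a*b == c, the two forms yield -0.0 and +0.0 respectively). A sketch of a loop that should map onto it once signed zeros may be ignored (e.g. under -ffast-math) and the loop is vectorized for LSX or LASX (illustrative, not part of the patch):

    /* r[i] = c[i] - a[i] * b[i] is fnma, i.e. fma (-a, b, c);
       with -fno-signed-zeros it can use <x>vfnmsub.  */
    void
    fnma_loop (float *restrict r, const float *restrict a,
               const float *restrict b, const float *restrict c, int n)
    {
      for (int i = 0; i < n; i++)
        r[i] = c[i] - a[i] * b[i];
    }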
@@ -826,6 +857,39 @@ DONE; }) +(define_expand "<su>dot_prod<wvec_quarter><mode>" + [(match_operand:<WVEC_QUARTER> 0 "register_operand" "=f,f") + (match_operand:IVEC_HB 1 "register_operand" "f,f") + (match_operand:IVEC_HB 2 "register_operand" "f,f") + (match_operand:<WVEC_QUARTER> 3 "reg_or_0_operand" "f, YG") + (any_extend (const_int 0))] + "" +{ + rtx *op = operands; + rtx res_mulev = gen_reg_rtx (<WVEC_HALF>mode); + rtx res_mulod = gen_reg_rtx (<WVEC_HALF>mode); + rtx res_addev = gen_reg_rtx (<WVEC_QUARTER>mode); + rtx res_addod = gen_reg_rtx (<WVEC_QUARTER>mode); + emit_insn (gen_<simd_isa>_<x>vmulwev_<simdfmt_w>_<simdfmt><u> + (res_mulev, op[1], op[2])); + emit_insn (gen_<simd_isa>_<x>vmulwod_<simdfmt_w>_<simdfmt><u> + (res_mulod, op[1], op[2])); + emit_insn (gen_<simd_isa>_<x>vhaddw_<simdfmt_qw><u>_<simdfmt_w><u> + (res_addev, res_mulev, res_mulev)); + emit_insn (gen_<simd_isa>_<x>vhaddw_<simdfmt_qw><u>_<simdfmt_w><u> + (res_addod, res_mulod, res_mulod)); + if (op[3] == CONST0_RTX (<WVEC_QUARTER>mode)) + emit_insn (gen_add<wvec_quarter>3 (op[0], res_addev, + res_addod)); + else + { + emit_insn (gen_add<wvec_quarter>3 (res_addev, res_addev, + res_addod)); + emit_insn (gen_add<wvec_quarter>3 (op[0], res_addev, op[3])); + } + DONE; +}) + (define_insn "simd_maddw_evod_<mode>_hetero" [(set (match_operand:<WVEC_HALF> 0 "register_operand" "=f") (plus:<WVEC_HALF> @@ -972,6 +1036,77 @@ DONE; }) +(define_insn "xor<mode>3" + [(set (match_operand:ALLVEC 0 "register_operand" "=f,f,f") + (xor:ALLVEC + (match_operand:ALLVEC 1 "register_operand" "f,f,f") + (match_operand:ALLVEC 2 "reg_or_vector_same_val_operand" "f,YC,Urv8")))] + "" + "@ + <x>vxor.v\t%<wu>0,%<wu>1,%<wu>2 + <x>vbitrevi.%v0\t%<wu>0,%<wu>1,%V2 + <x>vxori.b\t%<wu>0,%<wu>1,%B2" + [(set_attr "type" "simd_logic,simd_bit,simd_logic") + (set_attr "mode" "<MODE>")]) + +(define_insn "ior<mode>3" + [(set (match_operand:ALLVEC 0 "register_operand" "=f,f,f") + (ior:ALLVEC + (match_operand:ALLVEC 1 "register_operand" "f,f,f") + (match_operand:ALLVEC 2 "reg_or_vector_same_val_operand" "f,YC,Urv8")))] + "" + "@ + <x>vor.v\t%<wu>0,%<wu>1,%<wu>2 + <x>vbitseti.%v0\t%<wu>0,%<wu>1,%V2 + <x>vori.b\t%<wu>0,%<wu>1,%B2" + [(set_attr "type" "simd_logic,simd_bit,simd_logic") + (set_attr "mode" "<MODE>")]) + +(define_insn "and<mode>3" + [(set (match_operand:ALLVEC 0 "register_operand" "=f,f,f") + (and:ALLVEC + (match_operand:ALLVEC 1 "register_operand" "f,f,f") + (match_operand:ALLVEC 2 "reg_or_vector_same_val_operand" "f,YZ,Urv8")))] + "" +{ + switch (which_alternative) + { + case 0: + return "<x>vand.v\t%<wu>0,%<wu>1,%<wu>2"; + case 1: + { + rtx elt0 = CONST_VECTOR_ELT (operands[2], 0); + unsigned HOST_WIDE_INT val; + if (GET_MODE_CLASS (<MODE>mode) == MODE_VECTOR_FLOAT) + { + const REAL_VALUE_TYPE *x = CONST_DOUBLE_REAL_VALUE (elt0); + if (GET_MODE (elt0) == DFmode) + { + long tmp[2]; + REAL_VALUE_TO_TARGET_DOUBLE (*x, tmp); + val = ~((unsigned HOST_WIDE_INT) tmp[1] << 32 | tmp[0]); + } + else + { + long tmp; + REAL_VALUE_TO_TARGET_SINGLE (*x, tmp); + val = ~((unsigned HOST_WIDE_INT) tmp); + } + } + else + val = ~UINTVAL (elt0); + operands[2] = loongarch_gen_const_int_vector (<VIMODE>mode, val & (-val)); + return "<x>vbitclri.%v0\t%<wu>0,%<wu>1,%V2"; + } + case 2: + return "<x>vandi.b\t%<wu>0,%<wu>1,%B2"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "simd_logic,simd_bit,simd_logic") + (set_attr "mode" "<MODE>")]) + ; The LoongArch SX Instructions. 
(include "lsx.md") diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md index 59b71ed..697198f 100644 --- a/gcc/config/riscv/bitmanip.md +++ b/gcc/config/riscv/bitmanip.md @@ -1,4 +1,4 @@ -;); Machine description for RISC-V Bit Manipulation operations. +;; Machine description for RISC-V Bit Manipulation operations. ;; Copyright (C) 2021-2025 Free Software Foundation, Inc. ;; This file is part of GCC. @@ -237,19 +237,20 @@ [(set_attr "type" "bitmanip") (set_attr "mode" "<X:MODE>")]) -(define_insn_and_split "*<optab>_not_const<mode>" - [(set (match_operand:X 0 "register_operand" "=r") - (bitmanip_bitwise:X (not:X (match_operand:X 1 "register_operand" "r")) - (match_operand:X 2 "const_arith_operand" "I"))) - (clobber (match_scratch:X 3 "=&r"))] +(define_peephole2 + [(match_scratch:X 4 "r") + (set (match_operand:X 0 "register_operand") + (not:X (match_operand:X 1 "register_operand"))) + (set (match_operand:X 2 "register_operand") + (bitmanip_bitwise:X (match_dup 0) + (match_operand 3 "const_int_operand"))) + (match_dup 4)] "(TARGET_ZBB || TARGET_ZBKB) && !TARGET_ZCB - && !optimize_function_for_size_p (cfun)" - "#" - "&& reload_completed" - [(set (match_dup 3) (match_dup 2)) - (set (match_dup 0) (bitmanip_bitwise:X (not:X (match_dup 1)) (match_dup 3)))] - "" - [(set_attr "type" "bitmanip")]) + && !optimize_function_for_size_p (cfun) + && rtx_equal_p (operands[0], operands[2]) + && riscv_const_insns (operands[3], false) == 1" + [(set (match_dup 4) (match_dup 3)) + (set (match_dup 0) (bitmanip_bitwise:X (not:X (match_dup 1)) (match_dup 4)))]) ;; '(a >= 0) ? b : 0' is emitted branchless (from if-conversion). Without a ;; bit of extra help for combine (i.e., the below split), we end up emitting diff --git a/gcc/config/riscv/riscv-avlprop.cc b/gcc/config/riscv/riscv-avlprop.cc index b8547a7..a42764e 100644 --- a/gcc/config/riscv/riscv-avlprop.cc +++ b/gcc/config/riscv/riscv-avlprop.cc @@ -77,6 +77,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-pass.h" #include "df.h" #include "rtl-ssa.h" +#include "rtl-iter.h" #include "cfgcleanup.h" #include "insn-attr.h" #include "tm-constrs.h" @@ -412,6 +413,46 @@ pass_avlprop::get_vlmax_ta_preferred_avl (insn_info *insn) const && def1->insn ()->compare_with (insn) >= 0) return NULL_RTX; } + else + { + /* If the use is in a subreg e.g. in a store it is possible that + we punned the vector mode with a larger mode like + (subreg:V1SI (reg:V4QI 123)). + For an AVL of 1 that means we actually store one SImode + element and not 1 QImode elements. But the latter is what we + would propagate if we took the AVL operand literally. + Instead we scale it by the ratio of inner and outer mode + (4 in the example above). 
*/ + int factor = 1; + if (use->includes_subregs ()) + { + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, use_insn->rtl (), NONCONST) + { + const_rtx x = *iter; + if (x + && SUBREG_P (x) + && REG_P (SUBREG_REG (x)) + && REGNO (SUBREG_REG (x)) == use->regno () + && known_eq (GET_MODE_SIZE (use->mode ()), + GET_MODE_SIZE (GET_MODE (x)))) + { + if (can_div_trunc_p (GET_MODE_NUNITS (use->mode ()), + GET_MODE_NUNITS (GET_MODE (x)), + &factor)) + { + gcc_assert (factor > 0); + break; + } + else + return NULL_RTX; + } + } + } + + if (factor > 1) + new_use_avl = GEN_INT (INTVAL (new_use_avl) * factor); + } if (!use_avl) use_avl = new_use_avl; diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 013b1dd..570acb1 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -209,6 +209,11 @@ rtl_opt_pass * make_pass_insert_landing_pad (gcc::context *ctxt); rtl_opt_pass * make_pass_vector_permconst (gcc::context *ctxt); rtl_opt_pass * make_pass_bclr_lowest_set_bit (gcc::context *ctxt); +/* Routines implemented in riscv-vsetvl.cc. */ +extern bool has_vtype_op (rtx_insn *); +extern bool mask_agnostic_p (rtx_insn *); +extern rtx get_avl (rtx_insn *); +extern bool vsetvl_insn_p (rtx_insn *); /* Routines implemented in riscv-string.c. */ extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx); @@ -834,7 +839,8 @@ extern bool th_print_operand_address (FILE *, machine_mode, rtx); extern bool strided_load_broadcast_p (void); extern bool riscv_prefer_agnostic_p (void); extern bool riscv_use_divmod_expander (void); -void riscv_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, tree, int); +void riscv_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, + rtx, tree, int, bool); extern bool riscv_option_valid_attribute_p (tree, tree, tree, int); extern bool diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc index 7e4d396..22b77cc 100644 --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc @@ -1793,12 +1793,13 @@ public: The fold routines expect the replacement statement to have the same lhs as the original call, so return the copy statement rather than the field update. */ - gassign *copy = gimple_build_assign (unshare_expr (f.lhs), rhs_tuple); + gassign *copy = gimple_build_assign (f.lhs, rhs_tuple); /* Get a reference to the individual vector. */ tree field = tuple_type_field (TREE_TYPE (f.lhs)); tree lhs_array - = build3 (COMPONENT_REF, TREE_TYPE (field), f.lhs, field, NULL_TREE); + = build3 (COMPONENT_REF, TREE_TYPE (field), unshare_expr (f.lhs), + field, NULL_TREE); tree lhs_vector = build4 (ARRAY_REF, TREE_TYPE (rhs_vector), lhs_array, index, NULL_TREE, NULL_TREE); gassign *update = gimple_build_assign (lhs_vector, rhs_vector); diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 3586d0c..580ac9c 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -258,7 +258,7 @@ policy_to_str (bool agnostic_p) /* Return true if it is an RVV instruction depends on VTYPE global status register. */ -static bool +bool has_vtype_op (rtx_insn *rinsn) { return recog_memoized (rinsn) >= 0 && get_attr_has_vtype_op (rinsn); @@ -306,7 +306,7 @@ vector_config_insn_p (rtx_insn *rinsn) } /* Return true if it is vsetvldi or vsetvlsi. 
*/ -static bool +bool vsetvl_insn_p (rtx_insn *rinsn) { if (!rinsn || !vector_config_insn_p (rinsn)) @@ -386,7 +386,7 @@ get_vl (rtx_insn *rinsn) } /* Helper function to get AVL operand. */ -static rtx +rtx get_avl (rtx_insn *rinsn) { if (vsetvl_insn_p (rinsn) || vsetvl_discard_result_insn_p (rinsn)) @@ -411,7 +411,7 @@ get_default_ma () } /* Helper function to get MA operand. */ -static bool +bool mask_agnostic_p (rtx_insn *rinsn) { /* If it doesn't have MA, we return agnostic by default. */ diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index d5de76c..e978f92 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -740,6 +740,7 @@ static tree riscv_handle_fndecl_attribute (tree *, tree, tree, int, bool *); static tree riscv_handle_type_attribute (tree *, tree, tree, int, bool *); static tree riscv_handle_rvv_vector_bits_attribute (tree *, tree, tree, int, bool *); +static tree riscv_handle_rvv_vls_cc_attribute (tree *, tree, tree, int, bool *); /* Defining target-specific uses of __attribute__. */ static const attribute_spec riscv_gnu_attributes[] = @@ -763,6 +764,8 @@ static const attribute_spec riscv_gnu_attributes[] = standard vector calling convention variant. Syntax: __attribute__((riscv_vector_cc)). */ {"riscv_vector_cc", 0, 0, false, true, true, true, NULL, NULL}, + {"riscv_vls_cc", 0, 1, false, true, true, true, + riscv_handle_rvv_vls_cc_attribute, NULL}, /* This attribute is used to declare a new type, to appoint the exactly bits size of the type. For example: @@ -790,6 +793,8 @@ static const attribute_spec riscv_attributes[] = standard vector calling convention variant. Syntax: [[riscv::vector_cc]]. */ {"vector_cc", 0, 0, false, true, true, true, NULL, NULL}, + {"vls_cc", 0, 1, false, true, true, true, riscv_handle_rvv_vls_cc_attribute, + NULL}, /* This attribute is used to declare a new type, to appoint the exactly bits size of the type. For example: @@ -3723,6 +3728,12 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src) riscv_vector::emit_vec_extract (result, v, gen_int_mode (index + i, Pmode)); + /* The low-part must be zero-extended when ELEN == 32 and + mode == 64. */ + if (num == 2 && i == 0) + emit_insn (gen_extend_insn (int_reg, result, mode, smode, + true)); + if (i == 1) { if (UNITS_PER_WORD < mode_size) @@ -5872,11 +5883,12 @@ typedef struct { floating-point registers. */ static int -riscv_flatten_aggregate_field (const_tree type, - riscv_aggregate_field fields[2], +riscv_flatten_aggregate_field (const_tree type, riscv_aggregate_field *fields, int n, HOST_WIDE_INT offset, - bool ignore_zero_width_bit_field_p) + bool ignore_zero_width_bit_field_p, + bool vls_p = false, unsigned abi_vlen = 0) { + int max_aggregate_field = vls_p ? 
8 : 2; switch (TREE_CODE (type)) { case RECORD_TYPE: @@ -5903,9 +5915,9 @@ riscv_flatten_aggregate_field (const_tree type, else { HOST_WIDE_INT pos = offset + int_byte_position (f); - n = riscv_flatten_aggregate_field (TREE_TYPE (f), - fields, n, pos, - ignore_zero_width_bit_field_p); + n = riscv_flatten_aggregate_field ( + TREE_TYPE (f), fields, n, pos, ignore_zero_width_bit_field_p, + vls_p, abi_vlen); } if (n < 0) return -1; @@ -5915,13 +5927,14 @@ riscv_flatten_aggregate_field (const_tree type, case ARRAY_TYPE: { HOST_WIDE_INT n_elts; - riscv_aggregate_field subfields[2]; + riscv_aggregate_field subfields[8]; tree index = TYPE_DOMAIN (type); tree elt_size = TYPE_SIZE_UNIT (TREE_TYPE (type)); - int n_subfields = riscv_flatten_aggregate_field (TREE_TYPE (type), - subfields, 0, offset, - ignore_zero_width_bit_field_p); - + int n_subfields + = riscv_flatten_aggregate_field (TREE_TYPE (type), subfields, 0, + offset, + ignore_zero_width_bit_field_p, vls_p, + abi_vlen); /* Can't handle incomplete types nor sizes that are not fixed. */ if (n_subfields <= 0 || !COMPLETE_TYPE_P (type) @@ -5941,7 +5954,7 @@ riscv_flatten_aggregate_field (const_tree type, for (HOST_WIDE_INT i = 0; i < n_elts; i++) for (int j = 0; j < n_subfields; j++) { - if (n >= 2) + if (n >= max_aggregate_field) return -1; fields[n] = subfields[j]; @@ -5973,18 +5986,36 @@ riscv_flatten_aggregate_field (const_tree type, } default: - if (n < 2 - && ((SCALAR_FLOAT_TYPE_P (type) - && GET_MODE_SIZE (TYPE_MODE (type)).to_constant () <= UNITS_PER_FP_ARG) - || (INTEGRAL_TYPE_P (type) - && GET_MODE_SIZE (TYPE_MODE (type)).to_constant () <= UNITS_PER_WORD))) + poly_uint64 mode_size = GET_MODE_SIZE (TYPE_MODE (type)); + if (vls_p) { - fields[n].type = type; - fields[n].offset = offset; - return n + 1; + gcc_assert (abi_vlen != 0); + if (n < max_aggregate_field + && (VECTOR_TYPE_P (type) && mode_size.is_constant () + && (mode_size.to_constant () <= abi_vlen * 8))) + { + fields[n].type = type; + fields[n].offset = offset; + return n + 1; + } + else + return -1; } else - return -1; + { + if (n < max_aggregate_field + && ((SCALAR_FLOAT_TYPE_P (type) + && mode_size.to_constant () <= UNITS_PER_FP_ARG) + || (INTEGRAL_TYPE_P (type) + && mode_size.to_constant () <= UNITS_PER_WORD))) + { + fields[n].type = type; + fields[n].offset = offset; + return n + 1; + } + else + return -1; + } } } @@ -5993,14 +6024,16 @@ riscv_flatten_aggregate_field (const_tree type, static int riscv_flatten_aggregate_argument (const_tree type, - riscv_aggregate_field fields[2], - bool ignore_zero_width_bit_field_p) + riscv_aggregate_field *fields, + bool ignore_zero_width_bit_field_p, + bool vls_p = false, unsigned abi_vlen = 0) { if (!type || TREE_CODE (type) != RECORD_TYPE) return -1; return riscv_flatten_aggregate_field (type, fields, 0, 0, - ignore_zero_width_bit_field_p); + ignore_zero_width_bit_field_p, vls_p, + abi_vlen); } /* See whether TYPE is a record whose fields should be returned in one or @@ -6163,18 +6196,22 @@ riscv_pass_vls_aggregate_in_gpr (struct riscv_arg_info *info, machine_mode mode, return gen_rtx_PARALLEL (mode, gen_rtvec (1, x)); } +static const predefined_function_abi & +riscv_fntype_abi_1 (const_tree fntype, bool check_only); + /* Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function whose data type is FNTYPE. For a library call, FNTYPE is 0. 
*/ void riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, const_tree fntype, - rtx, tree, int) + rtx, tree, int, bool check_only) { memset (cum, 0, sizeof (*cum)); if (fntype) - cum->variant_cc = (riscv_cc) fntype_abi (fntype).id (); + cum->variant_cc = + (riscv_cc) riscv_fntype_abi_1 (fntype, check_only).id (); else cum->variant_cc = RISCV_CC_BASE; } @@ -6197,7 +6234,7 @@ riscv_hard_regno_nregs (unsigned int regno, machine_mode mode); static rtx riscv_get_vector_arg (struct riscv_arg_info *info, const CUMULATIVE_ARGS *cum, - machine_mode mode, bool return_p) + machine_mode mode, bool return_p, bool vls_p = false) { gcc_assert (riscv_v_ext_mode_p (mode)); @@ -6233,8 +6270,9 @@ riscv_get_vector_arg (struct riscv_arg_info *info, const CUMULATIVE_ARGS *cum, int arg_reg_end = V_ARG_LAST - V_REG_FIRST; int aligned_reg_start = ROUND_UP (arg_reg_start, LMUL); - /* For scalable data and scalable tuple return value. */ - if (return_p) + /* For scalable data and scalable tuple return value. + For the VLS CC, we may pass a struct like a tuple, so we need to defer + the handling. */ + if (return_p && !vls_p) return gen_rtx_REG (mode, aligned_reg_start + V_REG_FIRST); /* Iterate through the USED_VRS array to find vector register groups that have @@ -6271,6 +6309,224 @@ riscv_get_vector_arg (struct riscv_arg_info *info, const CUMULATIVE_ARGS *cum, return NULL_RTX; } + +#define RISCV_ALL_VALID_ABI_VLEN(F) \ + F (32) \ + F (64) \ + F (128) \ + F (256) \ + F (512) \ + F (1024) \ + F (2048) \ + F (4096) \ + F (8192) \ + F (16384) + +/* Return true if CC is a variant of the VLS CC. */ + +static bool +riscv_vls_cc_p (riscv_cc cc) +{ + switch (cc) + { +#define VLS_CC_ABI_VLEN_CASE(ABI_VLEN) \ + case RISCV_CC_VLS_V_##ABI_VLEN: + RISCV_ALL_VALID_ABI_VLEN (VLS_CC_ABI_VLEN_CASE) + +#undef VLS_CC_ABI_VLEN_CASE + return true; + default: + return false; + } +} + +/* Get the ABI_VLEN from CC. */ + +static unsigned int +riscv_get_cc_abi_vlen (riscv_cc cc) +{ + switch (cc) + { +#define VLS_CC_ABI_VLEN_CASE(ABI_VLEN) \ + case RISCV_CC_VLS_V_##ABI_VLEN: \ + return ABI_VLEN; + RISCV_ALL_VALID_ABI_VLEN (VLS_CC_ABI_VLEN_CASE) + +#undef VLS_CC_ABI_VLEN_CASE + default: + gcc_unreachable (); + } +} + +/* Return true if ABI_VLEN is valid for the VLS CC. */ + +static bool +riscv_valid_abi_vlen_vls_cc_p (unsigned abi_vlen) +{ + switch (abi_vlen) + { +#define VLS_CC_ABI_VLEN_CASE(ABI_VLEN) \ + case ABI_VLEN: + RISCV_ALL_VALID_ABI_VLEN (VLS_CC_ABI_VLEN_CASE) + +#undef VLS_CC_ABI_VLEN_CASE + return true; + default: + return false; + } +} + +static riscv_cc +riscv_get_riscv_cc_by_abi_vlen (unsigned abi_vlen) +{ + switch (abi_vlen) + { +#define VLS_CC_ABI_VLEN_CASE(ABI_VLEN) \ + case ABI_VLEN: \ + return RISCV_CC_VLS_V_##ABI_VLEN; + RISCV_ALL_VALID_ABI_VLEN (VLS_CC_ABI_VLEN_CASE) + +#undef VLS_CC_ABI_VLEN_CASE + default: + gcc_unreachable (); + } +} + +/* Get a VLS mode that has the same size as MODE under ABI_VLEN, but whose + element is always an integer mode. */ + +static machine_mode +riscv_get_vls_container_type (machine_mode mode, unsigned abi_vlen) +{ + machine_mode element_mode = GET_MODE_INNER (mode); + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); + unsigned int lmul = ROUND_UP (mode_size * 8, abi_vlen) / abi_vlen; + + /* Always use an integer mode for passing, to simplify the logic - we allow + passing unsupported vector types in vector registers, e.g. float16x4_t + even without vector fp16 support.
*/ switch (GET_MODE_SIZE (element_mode).to_constant ()) + { + case 1: + element_mode = QImode; + break; + case 2: + element_mode = HImode; + break; + case 4: + element_mode = SImode; + break; + case 8: + element_mode = DImode; + break; + default: + gcc_unreachable (); + } + + scalar_mode smode = as_a<scalar_mode> (element_mode); + return get_lmul_mode (smode, lmul).require (); +} + +/* Pass a VLS type argument in vector argument registers. */ + +static rtx +riscv_pass_vls_in_vr (struct riscv_arg_info *info, const CUMULATIVE_ARGS *cum, + machine_mode mode, bool return_p) +{ + gcc_assert (riscv_v_ext_vls_mode_p (mode)); + + unsigned int abi_vlen = riscv_get_cc_abi_vlen (cum->variant_cc); + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); + unsigned int lmul = ROUND_UP (mode_size * 8, abi_vlen) / abi_vlen; + + /* Put into memory if it needs more than 8 registers (> LMUL 8). */ + if (lmul > 8) + return NULL_RTX; + + machine_mode vla_mode = riscv_get_vls_container_type (mode, abi_vlen); + rtx reg = riscv_get_vector_arg (info, cum, vla_mode, + return_p, /* vls_p */ true); + + /* Couldn't get a vector register to pass it in, so pass by memory. */ + if (!reg) + return NULL_RTX; + + PUT_MODE (reg, mode); + + return reg; } + +/* Pass an aggregate of VLS types in vector argument registers. */ + +static rtx +riscv_pass_aggregate_in_vr (struct riscv_arg_info *info, + const CUMULATIVE_ARGS *cum, const_tree type, + bool return_p) +{ + riscv_aggregate_field fields[8]; + unsigned int abi_vlen = riscv_get_cc_abi_vlen (cum->variant_cc); + int i; + int n = riscv_flatten_aggregate_argument (type, fields, true, + /* vls_p */ true, abi_vlen); + + if (n == -1) + return NULL_RTX; + + /* Check that all fields have the same size. */ + unsigned int mode_size + = GET_MODE_SIZE (TYPE_MODE (fields[0].type)).to_constant (); + for (int i = 1; i < n; i++) + if (GET_MODE_SIZE (TYPE_MODE (fields[i].type)).to_constant () != mode_size) + return NULL_RTX; /* Return NULL_RTX if we cannot find a suitable reg. */ + + /* Check that each field fits in one register of ABI_VLEN bits; with at + most 8 fields we use up to 8 vector registers to pass the argument. */ + if (mode_size * 8 > abi_vlen) + return NULL_RTX; /* Return NULL_RTX if we cannot find a suitable reg. */ + + /* Back up cum->used_vrs since we will defer the update until + riscv_function_arg_advance. */ + CUMULATIVE_ARGS local_cum; + memcpy (&local_cum, cum, sizeof (local_cum)); + + unsigned num_vrs = 0; + + /* Allocate vector registers for the arguments. */ + rtx expr_list[8]; + for (i = 0; i < n; i++) + { + machine_mode mode = TYPE_MODE (fields[i].type); + machine_mode vla_mode = riscv_get_vls_container_type (mode, abi_vlen); + /* Use riscv_get_vector_arg with the VLA type to simplify the calling + convention implementation. */ + rtx reg + = riscv_get_vector_arg (info, &local_cum, vla_mode, + return_p, /* vls_p */true); + + /* Couldn't get a vector register to pass it in, so pass by memory. */ + if (!reg) + return NULL_RTX; + + PUT_MODE (reg, mode); + + expr_list[i] + = gen_rtx_EXPR_LIST (VOIDmode, reg, GEN_INT (fields[i].offset)); + + num_vrs += info->num_vrs; + + /* Mark the corresponding registers in USED_VRS as used. */ + for (unsigned int i = 0; i < info->num_vrs; i++) + { + gcc_assert (!local_cum.used_vrs[info->vr_offset + i]); + local_cum.used_vrs[info->vr_offset + i] = true; + } + } + + info->num_vrs = num_vrs; + + return gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (n, expr_list)); +} + /* Fill INFO with information about a single argument, and return an RTL pattern to pass or return the argument.
Return NULL_RTX if argument cannot pass or return in registers, then the argument may be passed by reference or @@ -6363,7 +6619,17 @@ riscv_get_arg_info (struct riscv_arg_info *info, const CUMULATIVE_ARGS *cum, if (riscv_vector_type_p (type) && riscv_v_ext_mode_p (mode)) return riscv_get_vector_arg (info, cum, mode, return_p); - /* For vls mode aggregated in gpr. */ + if (riscv_vls_cc_p (cum->variant_cc)) + { + if (riscv_v_ext_vls_mode_p (mode)) + return riscv_pass_vls_in_vr (info, cum, mode, return_p); + + rtx ret = riscv_pass_aggregate_in_vr (info, cum, type, return_p); + if (ret) + return ret; + } + + /* For vls mode aggregated in gpr (for non-VLS-CC). */ if (riscv_v_ext_vls_mode_p (mode)) return riscv_pass_vls_aggregate_in_gpr (info, mode, gpr_base); } @@ -6420,7 +6686,8 @@ riscv_function_arg_advance (cumulative_args_t cum_v, cum->used_vrs[info.vr_offset + i] = true; } - if ((info.num_vrs > 0 || info.num_mrs > 0) && cum->variant_cc != RISCV_CC_V) + if ((info.num_vrs > 0 || info.num_mrs > 0) && cum->variant_cc != RISCV_CC_V + && !riscv_vls_cc_p (cum->variant_cc)) { error ("RVV type %qT cannot be passed to an unprototyped function", arg.type); @@ -6463,7 +6730,8 @@ riscv_function_value (const_tree ret_type, const_tree fn_decl_or_type, { const_tree fntype = TREE_CODE (fn_decl_or_type) == FUNCTION_DECL ? TREE_TYPE (fn_decl_or_type) : fn_decl_or_type; - riscv_init_cumulative_args (&args, fntype, NULL_RTX, NULL_TREE, 0); + riscv_init_cumulative_args (&args, fntype, NULL_RTX, NULL_TREE, 0, + /* check_only */true); } else memset (&args, 0, sizeof args); @@ -6532,14 +6800,20 @@ riscv_pass_by_reference (cumulative_args_t cum_v, const function_arg_info &arg) /* Implement TARGET_RETURN_IN_MEMORY. */ static bool -riscv_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED) +riscv_return_in_memory (const_tree type, const_tree fntype) { CUMULATIVE_ARGS args; + + if (fntype) + riscv_init_cumulative_args (&args, fntype, NULL_RTX, NULL_TREE, 0, + /* check_only */true); + else + /* The rules for returning in memory are the same as for passing the + first named argument by reference. */ + memset (&args, 0, sizeof args); + cumulative_args_t cum = pack_cumulative_args (&args); - /* The rules for returning in memory are the same as for passing the - first named argument by reference. */ - memset (&args, 0, sizeof args); function_arg_info arg (const_cast<tree> (type), /*named=*/true); return riscv_pass_by_reference (cum, arg); } @@ -6583,9 +6857,9 @@ riscv_setup_incoming_varargs (cumulative_args_t cum, /* Return the descriptor of the Standard Vector Calling Convention Variant. */ static const predefined_function_abi & -riscv_v_abi () +riscv_v_abi (riscv_cc abi) { - predefined_function_abi &v_abi = function_abis[RISCV_CC_V]; + predefined_function_abi &v_abi = function_abis[abi]; if (!v_abi.initialized_p ()) { HARD_REG_SET full_reg_clobbers @@ -6595,7 +6869,7 @@ riscv_v_abi () CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); for (int regno = V_REG_FIRST + 24; regno <= V_REG_FIRST + 31; regno += 1) CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); - v_abi.initialize (RISCV_CC_V, full_reg_clobbers); + v_abi.initialize (abi, full_reg_clobbers); } return v_abi; } @@ -6756,13 +7030,14 @@ riscv_validate_vector_type (const_tree type, const char *hint) RISC-V V registers. 
@@ -6756,13 +7030,14 @@ riscv_validate_vector_type (const_tree type, const char *hint)
    RISC-V V registers.  */
 
 static bool
-riscv_return_value_is_vector_type_p (const_tree fntype)
+riscv_return_value_is_vector_type_p (const_tree fntype, bool check_only)
 {
   tree return_type = TREE_TYPE (fntype);
 
   if (riscv_vector_type_p (return_type))
     {
-      riscv_validate_vector_type (return_type, "return type");
+      if (!check_only)
+	riscv_validate_vector_type (return_type, "return type");
       return true;
     }
   else
@@ -6773,7 +7048,7 @@ riscv_return_value_is_vector_type_p (const_tree fntype)
    RISC-V V registers.  */
 
 static bool
-riscv_arguments_is_vector_type_p (const_tree fntype)
+riscv_arguments_is_vector_type_p (const_tree fntype, bool check_only)
 {
   for (tree chain = TYPE_ARG_TYPES (fntype); chain && chain != void_list_node;
        chain = TREE_CHAIN (chain))
     {
       tree arg_type = TREE_VALUE (chain);
       if (riscv_vector_type_p (arg_type))
 	{
-	  riscv_validate_vector_type (arg_type, "argument type");
+	  if (!check_only)
+	    riscv_validate_vector_type (arg_type, "argument type");
 	  return true;
 	}
     }
@@ -6792,14 +7068,15 @@ riscv_arguments_is_vector_type_p (const_tree fntype)
 /* Return true if FUNC is a riscv_vector_cc function.
    For more details please reference the below link.
    https://github.com/riscv-non-isa/riscv-c-api-doc/pull/67 */
+
 static bool
-riscv_vector_cc_function_p (const_tree fntype)
+riscv_vector_cc_function_p (const_tree fntype, bool check_only)
 {
   tree attr = TYPE_ATTRIBUTES (fntype);
   bool vector_cc_p = lookup_attribute ("vector_cc", attr) != NULL_TREE
 		     || lookup_attribute ("riscv_vector_cc", attr) != NULL_TREE;
 
-  if (vector_cc_p && !TARGET_VECTOR)
+  if (vector_cc_p && !TARGET_VECTOR && !check_only)
     error_at (input_location,
 	      "function attribute %qs requires the V ISA extension",
 	      "riscv_vector_cc");
@@ -6807,26 +7084,91 @@ riscv_vector_cc_function_p (const_tree fntype)
   return vector_cc_p;
 }
 
-/* Implement TARGET_FNTYPE_ABI.  */
+/* Return the riscv_cc value according to the attribute arguments.
+   If the attribute arguments are invalid, return RISCV_CC_UNKNOWN,
+   and emit an error message unless CHECK_ONLY.  */
+
+static riscv_cc
+riscv_get_vls_cc_attr (const_tree args, bool check_only = false)
+{
+  /* The default ABI_VLEN is 128.  */
+  int abi_vlen = 128;
+
+  if (args && TREE_CODE (args) == TREE_LIST)
+    {
+      tree vlen_arg = TREE_VALUE (args);
+      if (vlen_arg && TREE_CODE (vlen_arg) == INTEGER_CST)
+	abi_vlen = TREE_INT_CST_LOW (vlen_arg);
+    }
+
+  if (!riscv_valid_abi_vlen_vls_cc_p (abi_vlen))
+    {
+      if (!check_only)
+	error_at (input_location,
+		  "unsupported %<ABI_VLEN%> value %d for %qs attribute; "
+		  "%<ABI_VLEN%> must be in the range [32, 16384] and be "
+		  "a power of 2",
+		  abi_vlen, "riscv_vls_cc");
+      return RISCV_CC_UNKNOWN;
+    }
+
+  return riscv_get_riscv_cc_by_abi_vlen (abi_vlen);
+}
+
+/* If FNTYPE carries the vls_cc/riscv_vls_cc attribute, return the
+   corresponding VLS calling-convention variant, otherwise return
+   RISCV_CC_UNKNOWN.  For more details please reference the below link.
+   https://github.com/riscv-non-isa/riscv-c-api-doc/pull/67 */
+
+static riscv_cc
+riscv_vls_cc_function_abi (const_tree fntype, bool check_only)
+{
+  tree attrs = TYPE_ATTRIBUTES (fntype);
+  tree attr = lookup_attribute ("vls_cc", attrs);
+  if (!attr)
+    attr = lookup_attribute ("riscv_vls_cc", attrs);
+
+  if (!attr)
+    return RISCV_CC_UNKNOWN;
+
+  if (!TARGET_VECTOR && !check_only)
+    error_at (input_location,
+	      "function attribute %qs requires the vector ISA extension",
+	      "riscv_vls_cc");
+
+  return riscv_get_vls_cc_attr (TREE_VALUE (attr), check_only);
+}
+
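The validity rule enforced here can be read off the error message: ABI_VLEN must be a power of two in [32, 16384], with 128 as the default. The body of riscv_valid_abi_vlen_vls_cc_p is not part of this diff, so the following standalone sketch is an assumption based on that message:

#include <cassert>

/* valid_abi_vlen_p: assumed semantics of riscv_valid_abi_vlen_vls_cc_p,
   inferred from the diagnostic above.  */
static bool
valid_abi_vlen_p (int vlen)
{
  return vlen >= 32 && vlen <= 16384 && (vlen & (vlen - 1)) == 0;
}

int
main ()
{
  assert (valid_abi_vlen_p (128));    /* The default.  */
  assert (!valid_abi_vlen_p (100));   /* Not a power of two.  */
  assert (!valid_abi_vlen_p (32768)); /* Above the upper bound.  */
  return 0;
}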
+/* Implementation of TARGET_FNTYPE_ABI, with one extra parameter CHECK_ONLY
+   to suppress the diagnostics.  */
 
 static const predefined_function_abi &
-riscv_fntype_abi (const_tree fntype)
+riscv_fntype_abi_1 (const_tree fntype, bool check_only)
 {
   /* Implement the vector calling convention.  For more details please
      reference the below link.
      https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/389  */
   bool validate_v_abi_p = false;
 
-  validate_v_abi_p |= riscv_return_value_is_vector_type_p (fntype);
-  validate_v_abi_p |= riscv_arguments_is_vector_type_p (fntype);
-  validate_v_abi_p |= riscv_vector_cc_function_p (fntype);
+  validate_v_abi_p |= riscv_return_value_is_vector_type_p (fntype, check_only);
+  validate_v_abi_p |= riscv_arguments_is_vector_type_p (fntype, check_only);
+  validate_v_abi_p |= riscv_vector_cc_function_p (fntype, check_only);
 
   if (validate_v_abi_p)
-    return riscv_v_abi ();
+    return riscv_v_abi (RISCV_CC_V);
+
+  riscv_cc abi = riscv_vls_cc_function_abi (fntype, check_only);
+  if (abi != RISCV_CC_UNKNOWN)
+    return riscv_v_abi (abi);
 
   return default_function_abi;
 }
 
+/* Implement TARGET_FNTYPE_ABI.  */
+
+static const predefined_function_abi &
+riscv_fntype_abi (const_tree fntype)
+{
+  return riscv_fntype_abi_1 (fntype, /* check_only */ false);
+}
+
 /* Return riscv calling convention of call_insn.  */
 riscv_cc
 get_riscv_cc (const rtx use)
@@ -6916,6 +7258,25 @@ riscv_handle_type_attribute (tree *node ATTRIBUTE_UNUSED, tree name, tree args,
 }
 
+/* Handle the vls_cc/riscv_vls_cc attribute; reject it when its ABI_VLEN
+   argument is invalid.  */
+
+static tree
+riscv_handle_rvv_vls_cc_attribute (tree *, tree name, tree args,
+				   ATTRIBUTE_UNUSED int flags,
+				   bool *no_add_attrs)
+{
+  bool vls_cc_p = is_attribute_p ("vls_cc", name)
+		  || is_attribute_p ("riscv_vls_cc", name);
+
+  if (!vls_cc_p)
+    return NULL_TREE;
+
+  riscv_cc cc = riscv_get_vls_cc_attr (args);
+
+  if (cc == RISCV_CC_UNKNOWN)
+    *no_add_attrs = true;
+
+  return NULL_TREE;
+}
+
 static tree
 riscv_handle_rvv_vector_bits_attribute (tree *node, tree name, tree args,
 					ATTRIBUTE_UNUSED int flags,
 					bool *no_add_attrs)
@@ -10215,6 +10576,71 @@ riscv_issue_rate (void)
   return tune_param->issue_rate;
 }
 
+/* Structure for very basic vector configuration tracking in the
+   scheduler.  */
+struct last_vconfig
+{
+  bool valid;
+  bool ta;
+  bool ma;
+  uint8_t sew;
+  uint8_t vlmul;
+  rtx avl;
+} last_vconfig;
+
+/* Clear LAST_VCONFIG so we have no known state.  */
+static void
+clear_vconfig (void)
+{
+  memset (&last_vconfig, 0, sizeof (last_vconfig));
+}
+
+/* Return TRUE if INSN is a vector insn needing a particular
+   vector configuration that is trivially equal to the last
+   vector insn issued.  Return FALSE otherwise.  */
+static bool
+compatible_with_last_vconfig (rtx_insn *insn)
+{
+  /* An explicit vsetvl establishes a new configuration itself, so never
+     treat it as trivially compatible with the previous one.  */
+  if (vsetvl_insn_p (insn))
+    return false;
+
+  /* Nothing to do for these cases.  */
+  if (!NONDEBUG_INSN_P (insn) || !has_vtype_op (insn))
+    return false;
+
+  extract_insn_cached (insn);
+
+  rtx avl = get_avl (insn);
+  if (avl != last_vconfig.avl)
+    return false;
+
+  if (get_sew (insn) != last_vconfig.sew)
+    return false;
+
+  if (get_vlmul (insn) != last_vconfig.vlmul)
+    return false;
+
+  if (tail_agnostic_p (insn) != last_vconfig.ta)
+    return false;
+
+  if (mask_agnostic_p (insn) != last_vconfig.ma)
+    return false;
+
+  /* No differences found, they're trivially compatible.  */
+  return true;
+}
+
+/* Implement TARGET_SCHED_INIT.  We use this to track the vector
+   configuration of the last issued vector instruction.  We can then use
+   that information to potentially adjust the ready queue to issue
+   instructions of a compatible vector configuration instead of a
+   conflicting configuration.
   That will reduce the number of vsetvl instructions we ultimately
   emit.  */
+static void
+riscv_sched_init (FILE *, int, int)
+{
+  clear_vconfig ();
+}
+
 /* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
 static int
 riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
@@ -10239,9 +10665,88 @@ riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
      an assert so we can find and fix this problem.  */
   gcc_assert (insn_has_dfa_reservation_p (insn));
 
+  /* If this is a vector insn with vl/vtype info, then record the last
+     vector configuration.  */
+  if (vsetvl_insn_p (insn))
+    clear_vconfig ();
+  else if (NONDEBUG_INSN_P (insn) && has_vtype_op (insn))
+    {
+      extract_insn_cached (insn);
+
+      rtx avl = get_avl (insn);
+      if (avl == RVV_VLMAX)
+	avl = const0_rtx;
+
+      if (!avl || !CONST_INT_P (avl))
+	clear_vconfig ();
+      else
+	{
+	  last_vconfig.valid = true;
+	  last_vconfig.avl = avl;
+	  last_vconfig.sew = get_sew (insn);
+	  last_vconfig.vlmul = get_vlmul (insn);
+	  last_vconfig.ta = tail_agnostic_p (insn);
+	  last_vconfig.ma = mask_agnostic_p (insn);
+	}
+    }
+
   return more - 1;
 }
 
+/* Implement TARGET_SCHED_REORDER.  The goal here is to look at the ready
+   queue and reorder it ever so slightly to encourage issuing an insn with
+   the same vector configuration as the most recently issued vector
+   instruction.  That will reduce vsetvl instructions.  */
+static int
+riscv_sched_reorder (FILE *, int, rtx_insn **ready, int *nreadyp, int)
+{
+  /* If we don't have a valid prior vector configuration, then there is
+     no point in reordering the ready queue; similarly if there is
+     just one entry in the queue.  */
+  if (!last_vconfig.valid || *nreadyp == 1)
+    return riscv_issue_rate ();
+
+  int nready = *nreadyp;
+  int priority = INSN_PRIORITY (ready[nready - 1]);
+  for (int i = nready - 1; i >= 0; i--)
+    {
+      rtx_insn *insn = ready[i];
+
+      /* On a high performance core, vsetvl instructions should be
+	 inexpensive.  Removing them is very much a secondary concern, so
+	 be extremely conservative with reordering, essentially only
+	 allowing reordering within the run of entries sharing the
+	 highest priority value.
+
+	 Lower end cores may benefit from more flexibility here.  That
+	 tuning is left to those who understand their core's behavior
+	 and can thoroughly benchmark the result.  Assuming such designs
+	 appear, we can probably put an entry in the tuning structure to
+	 indicate how much difference in priority to allow.  */
+      if (INSN_PRIORITY (insn) < priority)
+	break;
+
+      if (compatible_with_last_vconfig (insn))
+	{
+	  /* This entry is compatible with the last vconfig and has
+	     the same priority as the most important insn.  So swap
+	     it into the issue slot so that we keep the vector
+	     configuration as-is and ultimately eliminate a vsetvl.
+
+	     Note there is no need to swap if this entry is already
+	     the next one to issue.  */
+	  if (i == nready - 1)
+	    break;
+
+	  std::swap (ready[i], ready[nready - 1]);
+	  break;
+	}
+    }
+
+  return riscv_issue_rate ();
+}
+
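The reordering policy is easier to see in isolation: the back of the ready list is the next insn to issue, and the loop only considers the run of entries that share the top priority. A standalone sketch with illustrative types (not GCC's scheduler structures):

#include <algorithm>
#include <cassert>
#include <vector>

struct entry
{
  int priority;
  bool compatible; /* Same vector configuration as the last insn.  */
};

/* Mirror of the loop above: walk backwards through the run of entries
   sharing the top priority and swap the first compatible one into the
   issue slot (the back of the list).  */
static void
reorder_ready (std::vector<entry> &ready)
{
  if (ready.size () < 2)
    return;
  int top = ready.back ().priority;
  for (int i = ready.size () - 1; i >= 0; i--)
    {
      if (ready[i].priority < top)
	break;
      if (ready[i].compatible)
	{
	  std::swap (ready[i], ready.back ());
	  break;
	}
    }
}

int
main ()
{
  /* A lower-priority compatible entry must not be promoted.  */
  std::vector<entry> ready = { { 1, true }, { 5, false }, { 5, false } };
  reorder_ready (ready);
  assert (!ready.back ().compatible);

  /* A same-priority compatible entry is swapped into the issue slot.  */
  ready = { { 5, true }, { 5, false } };
  reorder_ready (ready);
  assert (ready.back ().compatible);
  return 0;
}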
 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
    instruction fusion of some sort.  */
@@ -11082,7 +11587,7 @@ riscv_asm_output_variant_cc (FILE *stream, const tree decl, const char *name)
   if (TREE_CODE (decl) == FUNCTION_DECL)
     {
       riscv_cc cc = (riscv_cc) fndecl_abi (decl).id ();
-      if (cc == RISCV_CC_V)
+      if (cc == RISCV_CC_V || riscv_vls_cc_p (cc))
 	{
 	  fprintf (stream, "\t.variant_cc\t");
 	  assemble_name (stream, name);
@@ -15650,9 +16155,15 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode mode)
 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
 #define TARGET_SCHED_MACRO_FUSION_PAIR_P riscv_macro_fusion_pair_p
 
+#undef TARGET_SCHED_INIT
+#define TARGET_SCHED_INIT riscv_sched_init
+
 #undef TARGET_SCHED_VARIABLE_ISSUE
 #define TARGET_SCHED_VARIABLE_ISSUE riscv_sched_variable_issue
 
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER riscv_sched_reorder
+
 #undef TARGET_SCHED_ADJUST_COST
 #define TARGET_SCHED_ADJUST_COST riscv_sched_adjust_cost
 
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 9146571..a0ad75c 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -779,6 +779,17 @@ enum riscv_cc
 {
   RISCV_CC_BASE = 0, /* Base standard RISC-V ABI.  */
   RISCV_CC_V, /* For functions that pass or return values in V registers.  */
+  /* Variants of the VLS calling convention, one per supported ABI_VLEN.  */
+  RISCV_CC_VLS_V_32,
+  RISCV_CC_VLS_V_64,
+  RISCV_CC_VLS_V_128,
+  RISCV_CC_VLS_V_256,
+  RISCV_CC_VLS_V_512,
+  RISCV_CC_VLS_V_1024,
+  RISCV_CC_VLS_V_2048,
+  RISCV_CC_VLS_V_4096,
+  RISCV_CC_VLS_V_8192,
+  RISCV_CC_VLS_V_16384,
   RISCV_CC_UNKNOWN
 };
 
@@ -786,6 +797,8 @@ typedef struct
 {
   /* The calling convention that current function used.  */
   enum riscv_cc variant_cc;
 
+  /* The ABI_VLEN, in bits, of the selected VLS calling-convention
+     variant.  */
+  unsigned int abi_vlen;
+
   /* Number of integer registers used so far, up to MAX_ARGS_IN_REGISTERS.  */
   unsigned int num_gprs;
 
@@ -809,7 +822,7 @@ extern enum riscv_cc get_riscv_cc (const rtx use);
 
 #define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, INDIRECT, N_NAMED_ARGS) \
   riscv_init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (INDIRECT),	\
-			      (N_NAMED_ARGS) != -1)
+			      (N_NAMED_ARGS) != -1, /* check_only */ false)
 
 #define EPILOGUE_USES(REGNO) riscv_epilogue_uses (REGNO)
 
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 3cb87bf..9d34725 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1437,6 +1437,8 @@
   [(set_attr "type" "vlde,vste,vmov")
    (set_attr "mode" "<MODE>")
    (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE))
+   (set (attr "has_vl_op") (const_string "false"))
+   (set (attr "has_vtype_op") (const_string "false"))
    (set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE))
    (set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))]
 )
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 374288d..c713451 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -43,6 +43,7 @@
   UNSPEC_FRAME_BLOCKAGE
   UNSPEC_CEIL
   UNSPEC_FLOOR
+  UNSPEC_ROUND
 ])
 
 (define_c_enum "unspecv" [
@@ -104,8 +105,11 @@
 ;; This iterator and attribute allow FP-to-integer rounding of two types
 ;; to be generated from one template.
-(define_int_iterator ANY_ROUND [UNSPEC_CEIL UNSPEC_FLOOR])
-(define_int_attr m_round [(UNSPEC_CEIL "ceil") (UNSPEC_FLOOR "floor")])
+(define_int_iterator ANY_ROUND [UNSPEC_CEIL UNSPEC_FLOOR UNSPEC_ROUND])
+(define_int_attr m_round [(UNSPEC_CEIL "ceil") (UNSPEC_FLOOR "floor")
+			  (UNSPEC_ROUND "round")])
+(define_int_attr c_round [(UNSPEC_CEIL "1") (UNSPEC_FLOOR "1")
+			  (UNSPEC_ROUND "flag_unsafe_math_optimizations")])
 
 ;; Attributes.
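The RISCV_CC_VLS_V_* enumerators are laid out as consecutive powers of two from 32 to 16384, which suggests a simple mapping for riscv_get_riscv_cc_by_abi_vlen. Its body is not in this diff, so the sketch below is an assumption:

#include <cassert>

/* Mirror of the enumerators added to riscv.h above.  */
enum riscv_cc
{
  RISCV_CC_BASE = 0,
  RISCV_CC_V,
  RISCV_CC_VLS_V_32,
  RISCV_CC_VLS_V_64,
  RISCV_CC_VLS_V_128,
  RISCV_CC_VLS_V_256,
  RISCV_CC_VLS_V_512,
  RISCV_CC_VLS_V_1024,
  RISCV_CC_VLS_V_2048,
  RISCV_CC_VLS_V_4096,
  RISCV_CC_VLS_V_8192,
  RISCV_CC_VLS_V_16384,
  RISCV_CC_UNKNOWN
};

/* Assumed mapping: the VLS_V_* enumerators are consecutive powers of
   two starting at 32, so the index is log2 (abi_vlen / 32).  ABI_VLEN
   is expected to have been validated already.  */
static riscv_cc
cc_by_abi_vlen (unsigned int abi_vlen)
{
  unsigned int idx = 0;
  for (unsigned int v = 32; v < abi_vlen; v *= 2)
    idx++;
  return (riscv_cc) (RISCV_CC_VLS_V_32 + idx);
}

int
main ()
{
  assert (cc_by_abi_vlen (32) == RISCV_CC_VLS_V_32);
  assert (cc_by_abi_vlen (128) == RISCV_CC_VLS_V_128);
  assert (cc_by_abi_vlen (16384) == RISCV_CC_VLS_V_16384);
  return 0;
}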
@@ -680,35 +684,26 @@
 	(set_attr "mode"	"SI")
 	(set_attr "length"	"3")])
 
-(define_insn_and_split "one_cmplsi2"
-  [(set (match_operand:SI 0 "register_operand" "=a")
-	(not:SI (match_operand:SI 1 "register_operand" "r")))]
+(define_expand "one_cmplsi2"
+  [(set (match_operand:SI 0 "register_operand")
+	(not:SI (match_operand:SI 1 "register_operand")))]
   ""
-  "#"
-  "&& can_create_pseudo_p ()"
-  [(set (match_dup 2)
-	(const_int -1))
-   (set (match_dup 0)
-	(xor:SI (match_dup 1)
-		(match_dup 2)))]
 {
-  operands[2] = gen_reg_rtx (SImode);
-}
-  [(set_attr "type"	"arith")
-   (set_attr "mode"	"SI")
-   (set (attr "length")
-	(if_then_else (match_test "TARGET_DENSITY")
-		      (const_int 5)
-		      (const_int 6)))])
+  emit_insn (gen_xorsi3 (operands[0], operands[1],
+			 force_reg (SImode, constm1_rtx)));
+  DONE;
+})
 
 (define_insn "negsf2"
-  [(set (match_operand:SF 0 "register_operand" "=f")
-	(neg:SF (match_operand:SF 1 "register_operand" "f")))]
+  [(set (match_operand:SF 0 "register_operand")
+	(neg:SF (match_operand:SF 1 "register_operand")))
+   (clobber (match_scratch:SI 2))]
   "TARGET_HARD_FLOAT"
-  "neg.s\t%0, %1"
-  [(set_attr "type"	"farith")
-   (set_attr "mode"	"SF")
-   (set_attr "length"	"3")])
+  {@ [cons: =0, 1, =2; attrs: type, length]
+   [D, D, &a; arith, 7] movi.n\t%2, 1\;slli\t%2, %2, 31\;add.n\t%0, %1, %2
+   [f, f, X; farith, 3] neg.s\t%0, %1
+  }
+  [(set_attr "mode" "SF")])
 
 ;; Logical instructions.
 
@@ -1150,7 +1145,7 @@
 (define_insn "*fix<s_fix>_truncsfsi2_scaled"
   [(set (match_operand:SI 0 "register_operand" "=a")
 	(any_fix:SI (mult:SF (match_operand:SF 1 "register_operand" "f")
-			     (match_operand:SF 2 "fix_scaling_operand" "F"))))]
+			     (match_operand:SF 2 "fix_scaling_operand" ""))))]
   "TARGET_HARD_FLOAT"
   "<m_fix>.s\t%0, %1, %U2"
   [(set_attr "type"	"fconv")
@@ -1169,7 +1164,7 @@
 (define_insn "*float<s_float>sisf2_scaled"
   [(set (match_operand:SF 0 "register_operand" "=f")
 	(mult:SF (any_float:SF (match_operand:SI 1 "register_operand" "a"))
-		 (match_operand:SF 2 "float_scaling_operand" "F")))]
+		 (match_operand:SF 2 "float_scaling_operand" "")))]
   "TARGET_HARD_FLOAT"
   "<m_float>.s\t%0, %1, %V2"
   [(set_attr "type"	"fconv")
@@ -1179,7 +1174,7 @@
 (define_insn "l<m_round>sfsi2"
   [(set (match_operand:SI 0 "register_operand" "=a")
 	(unspec:SI [(match_operand:SF 1 "register_operand" "f")] ANY_ROUND))]
-  "TARGET_HARD_FLOAT"
+  "TARGET_HARD_FLOAT && <c_round>"
   "<m_round>.s\t%0, %1, 0"
   [(set_attr "type"	"fconv")
    (set_attr "mode"	"SF")
@@ -1189,7 +1184,7 @@
   [(set (match_operand:SI 0 "register_operand" "=a")
 	(unspec:SI [(plus:SF (match_operand:SF 1 "register_operand" "f")
 			     (match_dup 1))] ANY_ROUND))]
-  "TARGET_HARD_FLOAT"
+  "TARGET_HARD_FLOAT && <c_round>"
   "<m_round>.s\t%0, %1, 1"
   [(set_attr "type"	"fconv")
    (set_attr "mode"	"SF")
@@ -1198,8 +1193,8 @@
 (define_insn "*l<m_round>sfsi2_scaled"
   [(set (match_operand:SI 0 "register_operand" "=a")
 	(unspec:SI [(mult:SF (match_operand:SF 1 "register_operand" "f")
-			     (match_operand:SF 2 "fix_scaling_operand" "F"))] ANY_ROUND))]
-  "TARGET_HARD_FLOAT"
+			     (match_operand:SF 2 "fix_scaling_operand" ""))] ANY_ROUND))]
+  "TARGET_HARD_FLOAT && <c_round>"
   "<m_round>.s\t%0, %1, %U2"
   [(set_attr "type"	"fconv")
   (set_attr "mode"	"SF")
   (set_attr "length"	"3")])
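The new first alternative of negsf2 works without touching the FPU: adding 0x80000000 modulo 2^32 leaves bits 0..30 alone and flips bit 31, the IEEE single-precision sign bit, which is exactly what neg.s computes; the movi.n/slli pair merely materializes the constant. A standalone check of that identity:

#include <cassert>
#include <cstdint>
#include <cstring>

/* negsf_via_add: host-side model of the movi.n/slli/add.n sequence in
   the negsf2 alternative above.  */
static float
negsf_via_add (float x)
{
  uint32_t bits;
  std::memcpy (&bits, &x, sizeof bits);
  bits += UINT32_C (0x80000000); /* add.n with the 1 << 31 constant.  */
  std::memcpy (&x, &bits, sizeof x);
  return x;
}

int
main ()
{
  assert (negsf_via_add (1.5f) == -1.5f);
  assert (negsf_via_add (-2.0f) == 2.0f);
  return 0;
}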
