| | | |
|---|---|---|
| author | Ian Lance Taylor <iant@golang.org> | 2021-10-07 15:28:36 -0700 |
| committer | Ian Lance Taylor <iant@golang.org> | 2021-10-07 15:28:36 -0700 |
| commit | 0b6b70a0733672600644c8df96942cda5bf86d3d (patch) | |
| tree | 9a1fbd7f782c54df55ab225ed1be057e3f3b0b8a /gcc/config | |
| parent | a5b5cabc91c38710adbe5c8a2b53882abe994441 (diff) | |
| parent | fba228e259dd5112851527f2dbb62c5601100985 (diff) | |
Merge from trunk revision fba228e259dd5112851527f2dbb62c5601100985.
Diffstat (limited to 'gcc/config')
46 files changed, 3225 insertions, 314 deletions
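Among the changes merged below, the arm_neon.h hunks rewire the `vqtbl1`/`vqtbx1` table-lookup intrinsics onto new poly- and signed-typed builtins (for example `__builtin_aarch64_qtbl1v8qi_ppu`) instead of casting every operand through the `int8x16_t` variant. The user-visible intrinsic signatures are unchanged; the following sketch, which assumes an AArch64 compiler providing `<arm_neon.h>`, shows the intrinsics whose implementation the diff touches:

```c
/* Usage sketch for the vqtbl1 intrinsics whose implementation changes
   below; the intrinsic signatures themselves are unchanged.  Assumes an
   AArch64 compiler providing <arm_neon.h>.  */
#include <arm_neon.h>

/* Reverse the low 8 bytes of a 16-byte table.  */
uint8x8_t
reverse_low8 (uint8x16_t table)
{
  static const uint8_t idx[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };
  return vqtbl1_u8 (table, vld1_u8 (idx));
}

/* Polynomial variant: after this merge it expands through the typed
   __builtin_aarch64_qtbl1v8qi_ppu builtin rather than int8x16_t casts.  */
poly8x8_t
lookup_p8 (poly8x16_t table, uint8x8_t idx)
{
  return vqtbl1_p8 (table, idx);
}
```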
diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def index b749727..a3b32e0 100644 --- a/gcc/config/aarch64/aarch64-arches.def +++ b/gcc/config/aarch64/aarch64-arches.def @@ -37,6 +37,8 @@ AARCH64_ARCH("armv8.3-a", generic, 8_3A, 8, AARCH64_FL_FOR_ARCH8_3) AARCH64_ARCH("armv8.4-a", generic, 8_4A, 8, AARCH64_FL_FOR_ARCH8_4) AARCH64_ARCH("armv8.5-a", generic, 8_5A, 8, AARCH64_FL_FOR_ARCH8_5) AARCH64_ARCH("armv8.6-a", generic, 8_6A, 8, AARCH64_FL_FOR_ARCH8_6) +AARCH64_ARCH("armv8.7-a", generic, 8_7A, 8, AARCH64_FL_FOR_ARCH8_7) AARCH64_ARCH("armv8-r", generic, 8R , 8, AARCH64_FL_FOR_ARCH8_R) +AARCH64_ARCH("armv9-a", generic, 9A , 9, AARCH64_FL_FOR_ARCH9) #undef AARCH64_ARCH diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index 119f67d..1a507ea 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -182,6 +182,10 @@ static enum aarch64_type_qualifiers aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_poly, qualifier_poly, qualifier_poly }; #define TYPES_BINOPP (aarch64_types_binopp_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_binop_ppu_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_poly, qualifier_poly, qualifier_unsigned }; +#define TYPES_BINOP_PPU (aarch64_types_binop_ppu_qualifiers) static enum aarch64_type_qualifiers aarch64_types_ternop_qualifiers[SIMD_MAX_BUILTIN_ARGS] @@ -207,6 +211,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] qualifier_unsigned, qualifier_immediate }; #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers) static enum aarch64_type_qualifiers +aarch64_types_ternop_sssu_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, qualifier_unsigned }; +#define TYPES_TERNOP_SSSU (aarch64_types_ternop_sssu_qualifiers) +static enum aarch64_type_qualifiers aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none }; #define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers) @@ -214,6 +222,10 @@ static enum aarch64_type_qualifiers aarch64_types_ternop_suss_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_unsigned, qualifier_none, qualifier_none }; #define TYPES_TERNOP_SUSS (aarch64_types_ternop_suss_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_binop_pppu_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_poly, qualifier_poly, qualifier_poly, qualifier_unsigned }; +#define TYPES_TERNOP_PPPU (aarch64_types_binop_pppu_qualifiers) static enum aarch64_type_qualifiers diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index b2aa167..77da310 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -162,4 +162,13 @@ AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, 8_2A, AAR /* Armv8-R Architecture Processors. */ AARCH64_CORE("cortex-r82", cortexr82, cortexa53, 8R, AARCH64_FL_FOR_ARCH8_R, cortexa53, 0x41, 0xd15, -1) +/* Armv9.0-A Architecture Processors. */ + +/* Arm ('A') cores. 
*/ +AARCH64_CORE("cortex-a510", cortexa510, cortexa55, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, cortexa53, 0x41, 0xd46, -1) + +AARCH64_CORE("cortex-a710", cortexa710, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd47, -1) + +AARCH64_CORE("cortex-x2", cortexx2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd48, -1) + #undef AARCH64_CORE diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 579328c..b61f1df 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -232,4 +232,7 @@ AARCH64_OPT_EXTENSION("flagm", AARCH64_FL_FLAGM, 0, 0, false, "flagm") /* Enabling/Disabling "pauth" only changes "pauth". */ AARCH64_OPT_EXTENSION("pauth", AARCH64_FL_PAUTH, 0, 0, false, "paca pacg") +/* Enabling/Disabling "ls64" only changes "ls64". */ +AARCH64_OPT_EXTENSION("ls64", AARCH64_FL_LS64, 0, 0, false, "") + #undef AARCH64_OPT_EXTENSION diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 402453a..35dc075 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -721,6 +721,8 @@ /* Implemented by aarch64_qtbl1<mode>. */ VAR2 (BINOP, qtbl1, 0, NONE, v8qi, v16qi) VAR2 (BINOPU, qtbl1, 0, NONE, v8qi, v16qi) + VAR2 (BINOP_PPU, qtbl1, 0, NONE, v8qi, v16qi) + VAR2 (BINOP_SSU, qtbl1, 0, NONE, v8qi, v16qi) /* Implemented by aarch64_qtbl2<mode>. */ VAR2 (BINOP, qtbl2, 0, NONE, v8qi, v16qi) @@ -734,6 +736,8 @@ /* Implemented by aarch64_qtbx1<mode>. */ VAR2 (TERNOP, qtbx1, 0, NONE, v8qi, v16qi) VAR2 (TERNOPU, qtbx1, 0, NONE, v8qi, v16qi) + VAR2 (TERNOP_PPPU, qtbx1, 0, NONE, v8qi, v16qi) + VAR2 (TERNOP_SSSU, qtbx1, 0, NONE, v8qi, v16qi) /* Implemented by aarch64_qtbx2<mode>. 
*/ VAR2 (TERNOP, qtbx2, 0, NONE, v8qi, v16qi) diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index e491c29..12be913 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 36519cc..a9a1800 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -23390,7 +23390,8 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, } /* Expand cpymem, as if from a __builtin_memcpy. Return true if - we succeed, otherwise return false. */ + we succeed, otherwise return false, indicating that a libcall to + memcpy should be emitted. */ bool aarch64_expand_cpymem (rtx *operands) @@ -23407,11 +23408,13 @@ aarch64_expand_cpymem (rtx *operands) unsigned HOST_WIDE_INT size = INTVAL (operands[2]); - /* Inline up to 256 bytes when optimizing for speed. */ + /* Try to inline up to 256 bytes. */ unsigned HOST_WIDE_INT max_copy_size = 256; - if (optimize_function_for_size_p (cfun)) - max_copy_size = 128; + bool size_p = optimize_function_for_size_p (cfun); + + if (size > max_copy_size) + return false; int copy_bits = 256; @@ -23421,13 +23424,14 @@ aarch64_expand_cpymem (rtx *operands) || !TARGET_SIMD || (aarch64_tune_params.extra_tuning_flags & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) - { - copy_bits = 128; - max_copy_size = max_copy_size / 2; - } + copy_bits = 128; - if (size > max_copy_size) - return false; + /* Emit an inline load+store sequence and count the number of operations + involved. We use a simple count of just the loads and stores emitted + rather than rtx_insn count as all the pointer adjustments and reg copying + in this function will get optimized away later in the pipeline. 
*/ + start_sequence (); + unsigned nops = 0; base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); dst = adjust_automodify_address (dst, VOIDmode, base, 0); @@ -23456,7 +23460,8 @@ aarch64_expand_cpymem (rtx *operands) cur_mode = V4SImode; aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode); - + /* A single block copy is 1 load + 1 store. */ + nops += 2; n -= mode_bits; /* Emit trailing copies using overlapping unaligned accesses - this is @@ -23471,7 +23476,16 @@ aarch64_expand_cpymem (rtx *operands) n = n_bits; } } + rtx_insn *seq = get_insns (); + end_sequence (); + + /* A memcpy libcall in the worst case takes 3 instructions to prepare the + arguments + 1 for the call. */ + unsigned libcall_cost = 4; + if (size_p && libcall_cost < nops) + return false; + emit_insn (seq); return true; } @@ -23534,40 +23548,37 @@ aarch64_expand_setmem (rtx *operands) if (!CONST_INT_P (operands[1])) return false; - bool speed_p = !optimize_function_for_size_p (cfun); + bool size_p = optimize_function_for_size_p (cfun); /* Default the maximum to 256-bytes. */ unsigned max_set_size = 256; - /* In case we are optimizing for size or if the core does not - want to use STP Q regs, lower the max_set_size. */ - max_set_size = (!speed_p - || (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) - ? max_set_size / 2 : max_set_size; - len = INTVAL (operands[1]); /* Upper bound check. */ if (len > max_set_size) return false; + /* Attempt a sequence with a vector broadcast followed by stores. + Count the number of operations involved to see if it's worth it for + code size. */ + start_sequence (); + unsigned nops = 0; base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); dst = adjust_automodify_address (dst, VOIDmode, base, 0); /* Prepare the val using a DUP/MOVI v0.16B, val. */ src = expand_vector_broadcast (V16QImode, val); src = force_reg (V16QImode, src); - + nops++; /* Convert len to bits to make the rest of the code simpler. */ n = len * BITS_PER_UNIT; /* Maximum amount to copy in one go. We allow 256-bit chunks based on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. setmem expand pattern is only turned on for TARGET_SIMD. */ - const int copy_limit = (speed_p - && (aarch64_tune_params.extra_tuning_flags - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)) + const int copy_limit = (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) ? GET_MODE_BITSIZE (TImode) : 256; while (n > 0) @@ -23583,7 +23594,7 @@ aarch64_expand_setmem (rtx *operands) mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant (); aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode); - + nops++; n -= mode_bits; /* Do certain trailing copies as overlapping if it's going to be @@ -23599,7 +23610,15 @@ aarch64_expand_setmem (rtx *operands) n = n_bits; } } + rtx_insn *seq = get_insns (); + end_sequence (); + /* A call to memset in the worst case requires 3 instructions to prepare + the arguments + 1 for the call. Prefer the inline sequence for size if + it is no longer than that. */ + if (size_p && nops > 4) + return false; + emit_insn (seq); return true; } diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index a5ba6c2..2792bb2 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -231,6 +231,15 @@ extern unsigned aarch64_architecture_version; /* Pointer Authentication (PAUTH) extension. */ #define AARCH64_FL_PAUTH (1ULL << 40) +/* 64-byte atomic load/store extensions. 
*/ +#define AARCH64_FL_LS64 (1ULL << 41) + +/* Armv8.7-a architecture extensions. */ +#define AARCH64_FL_V8_7 (1ULL << 42) + +/* Armv9.0-A. */ +#define AARCH64_FL_V9 (1ULL << 43) /* Armv9.0-A Architecture. */ + /* Has FP and SIMD. */ #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) @@ -255,8 +264,13 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_FOR_ARCH8_6 \ (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \ | AARCH64_FL_I8MM | AARCH64_FL_BF16) +#define AARCH64_FL_FOR_ARCH8_7 \ + (AARCH64_FL_FOR_ARCH8_6 | AARCH64_FL_V8_7 | AARCH64_FL_LS64) + #define AARCH64_FL_FOR_ARCH8_R \ (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_R) +#define AARCH64_FL_FOR_ARCH9 \ + (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_V9) /* Macros to test ISA flags. */ @@ -295,6 +309,7 @@ extern unsigned aarch64_architecture_version; #define AARCH64_ISA_SB (aarch64_isa_flags & AARCH64_FL_SB) #define AARCH64_ISA_V8_R (aarch64_isa_flags & AARCH64_FL_V8_R) #define AARCH64_ISA_PAUTH (aarch64_isa_flags & AARCH64_FL_PAUTH) +#define AARCH64_ISA_V9 (aarch64_isa_flags & AARCH64_FL_V9) /* Crypto is an optional extension to AdvSIMD. */ #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 635a223..2d5bf34 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -10416,15 +10416,14 @@ __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1_p8 (poly8x16_t __tab, uint8x8_t __idx) { - return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __tab, - (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi_ppu (__tab, __idx); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1_s8 (int8x16_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_qtbl1v8qi (__tab, (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi_ssu (__tab, __idx); } __extension__ extern __inline uint8x8_t @@ -10438,15 +10437,14 @@ __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1q_p8 (poly8x16_t __tab, uint8x16_t __idx) { - return (poly8x16_t) __builtin_aarch64_qtbl1v16qi ((int8x16_t) __tab, - (int8x16_t) __idx); + return __builtin_aarch64_qtbl1v16qi_ppu (__tab, __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbl1q_s8 (int8x16_t __tab, uint8x16_t __idx) { - return __builtin_aarch64_qtbl1v16qi (__tab, (int8x16_t) __idx); + return __builtin_aarch64_qtbl1v16qi_ssu (__tab, __idx); } __extension__ extern __inline uint8x16_t @@ -10460,7 +10458,7 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_s8 (int8x8_t __r, int8x16_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_qtbx1v8qi (__r, __tab, (int8x8_t) __idx); + return __builtin_aarch64_qtbx1v8qi_sssu (__r, __tab, __idx); } __extension__ extern __inline uint8x8_t @@ -10474,16 +10472,14 @@ __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1_p8 (poly8x8_t __r, poly8x16_t __tab, uint8x8_t __idx) { - return (poly8x8_t) __builtin_aarch64_qtbx1v8qi ((int8x8_t) __r, - (int8x16_t) __tab, - (int8x8_t) __idx); + return __builtin_aarch64_qtbx1v8qi_pppu (__r, __tab, __idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) vqtbx1q_s8 (int8x16_t __r, int8x16_t __tab, uint8x16_t __idx) { - return __builtin_aarch64_qtbx1v16qi (__r, __tab, (int8x16_t) __idx); + return __builtin_aarch64_qtbx1v16qi_sssu (__r, __tab, __idx); } __extension__ extern __inline uint8x16_t @@ -10497,9 +10493,7 @@ __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqtbx1q_p8 (poly8x16_t __r, poly8x16_t __tab, uint8x16_t __idx) { - return (poly8x16_t) __builtin_aarch64_qtbx1v16qi ((int8x16_t) __r, - (int8x16_t) __tab, - (int8x16_t) __idx); + return __builtin_aarch64_qtbx1v16qi_pppu (__r, __tab, __idx); } /* V7 legacy table intrinsics. */ @@ -10528,8 +10522,7 @@ vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); - return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __temp, - (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi_ppu (__temp, __idx); } __extension__ extern __inline int8x8_t @@ -10553,8 +10546,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - return (poly8x8_t) __builtin_aarch64_qtbl1v8qi ((int8x16_t) __temp, - (int8x8_t) __idx); + return __builtin_aarch64_qtbl1v8qi_ppu (__temp, __idx); } __extension__ extern __inline int8x8_t @@ -10653,9 +10645,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx2_p8 (poly8x8_t __r, poly8x8x2_t __tab, uint8x8_t __idx) { poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); - return (poly8x8_t) __builtin_aarch64_qtbx1v8qi ((int8x8_t) __r, - (int8x16_t) __temp, - (int8x8_t) __idx); + return __builtin_aarch64_qtbx1v8qi_pppu (__r, __temp, __idx); } /* End of temporary inline asm. 
*/ diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index bcc9ebe..d0d0d0f 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -1612,6 +1612,16 @@ begin cpu cortex-r52 part d13 end cpu cortex-r52 +begin cpu cortex-r52plus + cname cortexr52plus + tune flags LDSCHED + architecture armv8-r+crc+simd + option nofp.dp remove FP_DBL ALL_SIMD + costs cortex + vendor 41 + part d16 +end cpu cortex-r52plus + # FPU entries # format: # begin fpu <name> diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt index 5692d4f..8bb0c9f 100644 --- a/gcc/config/arm/arm-tables.opt +++ b/gcc/config/arm/arm-tables.opt @@ -282,6 +282,9 @@ Enum(processor_type) String(cortex-m55) Value( TARGET_CPU_cortexm55) EnumValue Enum(processor_type) String(cortex-r52) Value( TARGET_CPU_cortexr52) +EnumValue +Enum(processor_type) String(cortex-r52plus) Value( TARGET_CPU_cortexr52plus) + Enum Name(arm_arch) Type(int) Known ARM architectures (for use with the -march= option): diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md index b9df864..6482833 100644 --- a/gcc/config/arm/arm-tune.md +++ b/gcc/config/arm/arm-tune.md @@ -49,5 +49,5 @@ cortexx1,neoversen1,cortexa75cortexa55, cortexa76cortexa55,neoversev1,neoversen2, cortexm23,cortexm33,cortexm35p, - cortexm55,cortexr52" + cortexm55,cortexr52,cortexr52plus" (const (symbol_ref "((enum attr_tune) arm_tune)"))) diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 50524a5..0fa1c57 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -251,7 +251,7 @@ extern GTY(()) int darwin_ms_struct; %{v} \ %{g*:%{!gctf:%{!gbtf:%{!gstabs*:%{%:debug-level-gt(0): -idsym}}}}}\ %{.c|.cc|.C|.cpp|.cp|.c++|.cxx|.CPP|.m|.mm|.s|.f|.f90|\ - .f95|.f03|.f77|.for|.F|.F90|.F95|.F03: \ + .f95|.f03|.f77|.for|.F|.F90|.F95|.F03|.d: \ %{g*:%{!gctf:%{!gbtf:%{!gstabs*:%{%:debug-level-gt(0): -dsym}}}}}}}}}}}}}" #define LINK_COMMAND_SPEC LINK_COMMAND_SPEC_A DSYMUTIL_SPEC diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h index fc99c8d..6a432d1 100644 --- a/gcc/config/gcn/gcn-hsa.h +++ b/gcc/config/gcn/gcn-hsa.h @@ -75,25 +75,66 @@ extern unsigned int gcn_local_sym_hash (const char *name); supported for gcn. */ #define GOMP_SELF_SPECS "" +#ifdef HAVE_GCN_XNACK_FIJI +#define X_FIJI +#else +#define X_FIJI "!march=*:;march=fiji:;" +#endif +#ifdef HAVE_GCN_XNACK_GFX900 +#define X_900 +#else +#define X_900 "march=gfx900:;" +#endif +#ifdef HAVE_GCN_XNACK_GFX906 +#define X_906 +#else +#define X_906 "march=gfx906:;" +#endif +#ifdef HAVE_GCN_XNACK_GFX908 +#define X_908 +#else +#define X_908 "march=gfx908:;" +#endif + #ifdef HAVE_GCN_SRAM_ECC_FIJI -#define A_FIJI +#define S_FIJI #else -#define A_FIJI "!march=*:;march=fiji:;" +#define S_FIJI "!march=*:;march=fiji:;" #endif #ifdef HAVE_GCN_SRAM_ECC_GFX900 -#define A_900 +#define S_900 #else -#define A_900 "march=gfx900:;" +#define S_900 "march=gfx900:;" #endif #ifdef HAVE_GCN_SRAM_ECC_GFX906 -#define A_906 +#define S_906 #else -#define A_906 "march=gfx906:;" +#define S_906 "march=gfx906:;" #endif #ifdef HAVE_GCN_SRAM_ECC_GFX908 -#define A_908 +#define S_908 +#else +#define S_908 "march=gfx908:;" +#endif + +#ifdef HAVE_GCN_ASM_V3_SYNTAX +#define SRAMOPT "!msram-ecc=off:-mattr=+sram-ecc;:-mattr=-sram-ecc" +#endif +#ifdef HAVE_GCN_ASM_V4_SYNTAX +/* In HSACOv4 no attribute setting means the binary supports "any" hardware + configuration. The name of the attribute also changed. 
*/ +#define SRAMOPT "msram-ecc=on:-mattr=+sramecc;msram-ecc=off:-mattr=-sramecc" +#endif +#if !defined(SRAMOPT) && !defined(IN_LIBGCC2) +#error "No assembler syntax configured" +#endif + +#ifdef HAVE_GCN_ASM_V4_SYNTAX +/* FIJI cards don't seem to support drivers new enough to allow HSACOv4. */ +#define HSACO3_SELECT_OPT \ + "%{!march=*|march=fiji:--amdhsa-code-object-version=3} " #else -#define A_908 "march=gfx908:;" +#define HSACO3_SELECT_OPT #endif /* These targets can't have SRAM-ECC, even if a broken assembler allows it. */ @@ -103,10 +144,10 @@ extern unsigned int gcn_local_sym_hash (const char *name); /* Use LLVM assembler and linker options. */ #define ASM_SPEC "-triple=amdgcn--amdhsa " \ "%:last_arg(%{march=*:-mcpu=%*}) " \ - "-mattr=%{mxnack:+xnack;:-xnack} " \ - /* FIXME: support "any" when we move to HSACOv4. */ \ - "-mattr=%{" A_FIJI A_900 A_906 A_908 \ - "!msram-ecc=off:+sram-ecc;:-sram-ecc} " \ + HSACO3_SELECT_OPT \ + "%{" X_FIJI X_900 X_906 X_908 \ + "mxnack:-mattr=+xnack;:-mattr=-xnack} " \ + "%{" S_FIJI S_900 S_906 S_908 SRAMOPT "} " \ "-filetype=obj" #define LINK_SPEC "--pie --export-dynamic" #define LIB_SPEC "-lc" diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 84ff675..01fdce6 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -827,8 +827,12 @@ /* Work around assembler bug in which a 64-bit register is expected, but a 32-bit value would be correct. */ int reg = REGNO (operands[2]) - FIRST_VGPR_REG; - sprintf (buf, "global_load%%o0\t%%0, v[%d:%d], %%1 offset:%%3%s\;" - "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc); + if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) + sprintf (buf, "global_load%%o0\t%%0, v%d, %%1 offset:%%3%s\;" + "s_waitcnt\tvmcnt(0)", reg, glc); + else + sprintf (buf, "global_load%%o0\t%%0, v[%d:%d], %%1 offset:%%3%s\;" + "s_waitcnt\tvmcnt(0)", reg, reg + 1, glc); } else gcc_unreachable (); @@ -958,8 +962,12 @@ /* Work around assembler bug in which a 64-bit register is expected, but a 32-bit value would be correct. 
*/ int reg = REGNO (operands[1]) - FIRST_VGPR_REG; - sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s", - reg, reg + 1, glc); + if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED) + sprintf (buf, "global_store%%s3\tv%d, %%3, %%0 offset:%%2%s", + reg, glc); + else + sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s", + reg, reg + 1, glc); } else gcc_unreachable (); diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 2a3fc96..2e90f32 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -5217,42 +5217,76 @@ static void output_file_start (void) { const char *cpu; - bool use_sram = flag_sram_ecc; + bool use_xnack_attr = true; + bool use_sram_attr = true; switch (gcn_arch) { case PROCESSOR_FIJI: cpu = "gfx803"; +#ifndef HAVE_GCN_XNACK_FIJI + use_xnack_attr = false; +#endif #ifndef HAVE_GCN_SRAM_ECC_FIJI - use_sram = false; + use_sram_attr = false; #endif break; case PROCESSOR_VEGA10: cpu = "gfx900"; +#ifndef HAVE_GCN_XNACK_GFX900 + use_xnack_attr = false; +#endif #ifndef HAVE_GCN_SRAM_ECC_GFX900 - use_sram = false; + use_sram_attr = false; #endif break; case PROCESSOR_VEGA20: cpu = "gfx906"; +#ifndef HAVE_GCN_XNACK_GFX906 + use_xnack_attr = false; +#endif #ifndef HAVE_GCN_SRAM_ECC_GFX906 - use_sram = false; + use_sram_attr = false; #endif break; case PROCESSOR_GFX908: cpu = "gfx908"; +#ifndef HAVE_GCN_XNACK_GFX908 + use_xnack_attr = false; +#endif #ifndef HAVE_GCN_SRAM_ECC_GFX908 - use_sram = false; + use_sram_attr = false; #endif break; default: gcc_unreachable (); } +#if HAVE_GCN_ASM_V3_SYNTAX const char *xnack = (flag_xnack ? "+xnack" : ""); - /* FIXME: support "any" when we move to HSACOv4. */ - const char *sram_ecc = (use_sram ? "+sram-ecc" : ""); + const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : ""); +#endif +#if HAVE_GCN_ASM_V4_SYNTAX + /* In HSACOv4 no attribute setting means the binary supports "any" hardware + configuration. In GCC binaries, this is true for SRAM ECC, but not + XNACK. */ + const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-"); + const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+" + : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-" + : ""); +#endif + if (!use_xnack_attr) + xnack = ""; + if (!use_sram_attr) + sram_ecc = ""; fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n", - cpu, xnack, sram_ecc); + cpu, +#if HAVE_GCN_ASM_V3_SYNTAX + xnack, sram_ecc +#endif +#ifdef HAVE_GCN_ASM_V4_SYNTAX + sram_ecc, xnack +#endif + ); } /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h. diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c index 732bdfd..a3b22d0 100644 --- a/gcc/config/gcn/mkoffload.c +++ b/gcc/config/gcn/mkoffload.c @@ -54,8 +54,51 @@ #undef EF_AMDGPU_MACH_AMDGCN_GFX908 #define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30 -#define EF_AMDGPU_XNACK 0x100 -#define EF_AMDGPU_SRAM_ECC 0x200 +#define EF_AMDGPU_XNACK_V3 0x100 +#define EF_AMDGPU_SRAM_ECC_V3 0x200 + +#define EF_AMDGPU_FEATURE_XNACK_V4 0x300 /* Mask. */ +#define EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 0x000 +#define EF_AMDGPU_FEATURE_XNACK_ANY_V4 0x100 +#define EF_AMDGPU_FEATURE_XNACK_OFF_V4 0x200 +#define EF_AMDGPU_FEATURE_XNACK_ON_V4 0x300 + +#define EF_AMDGPU_FEATURE_SRAMECC_V4 0xc00 /* Mask. 
*/ +#define EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 0x000 +#define EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 0x400 +#define EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 0x800 +#define EF_AMDGPU_FEATURE_SRAMECC_ON_V4 0xc00 + +#ifdef HAVE_GCN_ASM_V3_SYNTAX +#define SET_XNACK_ON(VAR) VAR |= EF_AMDGPU_XNACK_V3 +#define SET_XNACK_OFF(VAR) VAR &= ~EF_AMDGPU_XNACK_V3 +#define TEST_XNACK(VAR) (VAR & EF_AMDGPU_XNACK_V3) + +#define SET_SRAM_ECC_ON(VAR) VAR |= EF_AMDGPU_SRAM_ECC_V3 +#define SET_SRAM_ECC_ANY(VAR) SET_SRAM_ECC_ON (VAR) +#define SET_SRAM_ECC_OFF(VAR) VAR &= ~EF_AMDGPU_SRAM_ECC_V3 +#define TEST_SRAM_ECC_ANY(VAR) 0 /* Not supported. */ +#define TEST_SRAM_ECC_ON(VAR) (VAR & EF_AMDGPU_SRAM_ECC_V3) +#endif +#ifdef HAVE_GCN_ASM_V4_SYNTAX +#define SET_XNACK_ON(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \ + | EF_AMDGPU_FEATURE_XNACK_ON_V4) +#define SET_XNACK_OFF(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \ + | EF_AMDGPU_FEATURE_XNACK_OFF_V4) +#define TEST_XNACK(VAR) ((VAR & EF_AMDGPU_FEATURE_XNACK_V4) \ + == EF_AMDGPU_FEATURE_XNACK_ON_V4) + +#define SET_SRAM_ECC_ON(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \ + | EF_AMDGPU_FEATURE_SRAMECC_ON_V4) +#define SET_SRAM_ECC_ANY(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \ + | EF_AMDGPU_FEATURE_SRAMECC_ANY_V4) +#define SET_SRAM_ECC_OFF(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_SRAMECC_V4) \ + | EF_AMDGPU_FEATURE_SRAMECC_OFF_V4) +#define TEST_SRAM_ECC_ANY(VAR) ((VAR & EF_AMDGPU_FEATURE_SRAMECC_V4) \ + == EF_AMDGPU_FEATURE_SRAMECC_ANY_V4) +#define TEST_SRAM_ECC_ON(VAR) ((VAR & EF_AMDGPU_FEATURE_SRAMECC_V4) \ + == EF_AMDGPU_FEATURE_SRAMECC_ON_V4) +#endif #ifndef R_AMDGPU_NONE #define R_AMDGPU_NONE 0 @@ -80,7 +123,13 @@ static struct obstack files_to_cleanup; enum offload_abi offload_abi = OFFLOAD_ABI_UNSET; uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX803; // Default GPU architecture. -uint32_t elf_flags = 0; +uint32_t elf_flags = +#ifdef HAVE_GCN_ASM_V3_SYNTAX + 0; +#endif +#ifdef HAVE_GCN_ASM_V4_SYNTAX + (EF_AMDGPU_FEATURE_XNACK_ANY_V4 | EF_AMDGPU_FEATURE_SRAMECC_ANY_V4); +#endif /* Delete tempfiles. */ @@ -851,23 +900,22 @@ main (int argc, char **argv) else if (strcmp (argv[i], "-fpic") == 0) fpic = true; else if (strcmp (argv[i], "-mxnack") == 0) - elf_flags |= EF_AMDGPU_XNACK; + SET_XNACK_ON (elf_flags); else if (strcmp (argv[i], "-mno-xnack") == 0) - elf_flags &= ~EF_AMDGPU_XNACK; + SET_XNACK_OFF (elf_flags); else if (strcmp (argv[i], "-msram-ecc=on") == 0) { - elf_flags |= EF_AMDGPU_SRAM_ECC; + SET_SRAM_ECC_ON (elf_flags); sram_seen = true; } else if (strcmp (argv[i], "-msram-ecc=any") == 0) { - /* FIXME: change this when we move to HSACOv4. */ - elf_flags |= EF_AMDGPU_SRAM_ECC; + SET_SRAM_ECC_ANY (elf_flags); sram_seen = true; } else if (strcmp (argv[i], "-msram-ecc=off") == 0) { - elf_flags &= ~EF_AMDGPU_SRAM_ECC; + SET_SRAM_ECC_OFF (elf_flags); sram_seen = true; } else if (strcmp (argv[i], "-save-temps") == 0) @@ -890,23 +938,27 @@ main (int argc, char **argv) if (!(fopenacc ^ fopenmp)) fatal_error (input_location, "either -fopenacc or -fopenmp must be set"); - /* The SRAM-ECC feature defaults to "any" on GPUs where the feature is - available. */ if (!sram_seen) - switch (elf_arch) - { - case EF_AMDGPU_MACH_AMDGCN_GFX803: - case EF_AMDGPU_MACH_AMDGCN_GFX900: - case EF_AMDGPU_MACH_AMDGCN_GFX906: + { +#ifdef HAVE_GCN_ASM_V3_SYNTAX + /* For HSACOv3, the SRAM-ECC feature defaults to "on" on GPUs where the + feature is available. + (HSACOv4 has elf_flags initialsed to "any" in all cases.) 
*/ + switch (elf_arch) + { + case EF_AMDGPU_MACH_AMDGCN_GFX803: + case EF_AMDGPU_MACH_AMDGCN_GFX900: + case EF_AMDGPU_MACH_AMDGCN_GFX906: #ifndef HAVE_GCN_SRAM_ECC_GFX908 - case EF_AMDGPU_MACH_AMDGCN_GFX908: + case EF_AMDGPU_MACH_AMDGCN_GFX908: #endif - break; - default: - /* FIXME: change this when we move to HSACOv4. */ - elf_flags |= EF_AMDGPU_SRAM_ECC; - break; - } + break; + default: + SET_SRAM_ECC_ON (elf_flags); + break; + } +#endif + } const char *abi; switch (offload_abi) @@ -936,11 +988,12 @@ main (int argc, char **argv) if (fopenmp) obstack_ptr_grow (&cc_argv_obstack, "-mgomp"); obstack_ptr_grow (&cc_argv_obstack, - (elf_flags & EF_AMDGPU_XNACK + (TEST_XNACK (elf_flags) ? "-mxnack" : "-mno-xnack")); obstack_ptr_grow (&cc_argv_obstack, - (elf_flags & EF_AMDGPU_SRAM_ECC - ? "-msram-ecc=on" : "-msram-ecc=off")); + (TEST_SRAM_ECC_ON (elf_flags) ? "-msram-ecc=on" + : TEST_SRAM_ECC_ANY (elf_flags) ? "-msram-ecc=any" + : "-msram-ecc=off")); for (int ix = 1; ix != argc; ix++) { @@ -1043,11 +1096,12 @@ main (int argc, char **argv) obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name); obstack_ptr_grow (&ld_argv_obstack, "-lgomp"); obstack_ptr_grow (&ld_argv_obstack, - (elf_flags & EF_AMDGPU_XNACK + (TEST_XNACK (elf_flags) ? "-mxnack" : "-mno-xnack")); obstack_ptr_grow (&ld_argv_obstack, - (elf_flags & EF_AMDGPU_SRAM_ECC - ? "-msram-ecc=on" : "-msram-ecc=off")); + (TEST_SRAM_ECC_ON (elf_flags) ? "-msram-ecc=on" + : TEST_SRAM_ECC_ANY (elf_flags) ? "-msram-ecc=any" + : "-msram-ecc=off")); if (verbose) obstack_ptr_grow (&ld_argv_obstack, "-v"); diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 4714696..29cf679 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -45,6 +45,14 @@ typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__)); typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__)); typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__)); +/* Unaligned version of the same type. 
*/ +typedef _Float16 __m128h_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); + extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5, @@ -362,6 +370,48 @@ _mm_load_sh (void const *__P) *(_Float16 const *) __P); } +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_ph (void const *__P) +{ + return *(const __m512h *) __P; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_ph (void const *__P) +{ + return *(const __m256h *) __P; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ph (void const *__P) +{ + return *(const __m128h *) __P; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_ph (void const *__P) +{ + return *(const __m512h_u *) __P; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_ph (void const *__P) +{ + return *(const __m256h_u *) __P; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_ph (void const *__P) +{ + return *(const __m128h_u *) __P; +} + /* Stores the lower _Float16 value. */ extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -370,6 +420,56 @@ _mm_store_sh (void *__P, __m128h __A) *(_Float16 *) __P = ((__v8hf)__A)[0]; } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_ph (void *__P, __m512h __A) +{ + *(__m512h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_ph (void *__P, __m256h __A) +{ + *(__m256h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ph (void *__P, __m128h __A) +{ + *(__m128h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_ph (void *__P, __m512h __A) +{ + *(__m512h_u *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_ph (void *__P, __m256h __A) +{ + *(__m256h_u *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_ph (void *__P, __m128h __A) +{ + *(__m128h_u *) __P = __A; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_ph (__m512h __A) +{ + return (__m512h) _mm512_and_epi32 ( _mm512_set1_epi32 (0x7FFF7FFF), + (__m512i) __A); +} + /* Intrinsics v[add,sub,mul,div]ph. 
*/ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -621,6 +721,33 @@ _mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, (A), (D))) #endif /* __OPTIMIZE__ */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conj_pch (__m512h __A) +{ + return (__m512h) _mm512_xor_epi32 ((__m512i) __A, _mm512_set1_epi32 (1<<31)); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conj_pch (__m512h __W, __mmask16 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A), + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conj_pch (__mmask16 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A), + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U); +} + /* Intrinsics of v[add,sub,mul,div]sh. */ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -6115,6 +6242,1006 @@ _mm_maskz_fnmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A, #endif /* __OPTIMIZE__ */ +/* Intrinsics vf[,c]maddcph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) __builtin_ia32_movaps512_mask + ((__v16sf) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_pch (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) __builtin_ia32_movaps512_mask + ((__v16sf) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v16sf) __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D) +{ + return (__m512h) + 
__builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) __builtin_ia32_movaps512_mask + ((__v16sf) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + __E), + (__v16sf) __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, + __mmask16 __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) __builtin_ia32_movaps512_mask + ((__v16sf) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + __E), + (__v16sf) __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, + __mmask16 __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, __E); +} + +#else +#define _mm512_fcmadd_round_pch(A, B, C, D) \ + (__m512h) __builtin_ia32_vfcmaddcph512_round ((A), (B), (C), (D)) + +#define _mm512_mask_fcmadd_round_pch(A, B, C, D, E) \ + ((__m512h) __builtin_ia32_movaps512_mask ( \ + (__v16sf) \ + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) (A), \ + (__v32hf) (C), \ + (__v32hf) (D), \ + (B), (E)), \ + (__v16sf) (A), (B))); + + +#define 
_mm512_mask3_fcmadd_round_pch(A, B, C, D, E) \ + ((__m512h) \ + __builtin_ia32_vfcmaddcph512_mask_round ((A), (B), (C), (D), (E))) + +#define _mm512_maskz_fcmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfcmaddcph512_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm512_fmadd_round_pch(A, B, C, D) \ + (__m512h) __builtin_ia32_vfmaddcph512_round ((A), (B), (C), (D)) + +#define _mm512_mask_fmadd_round_pch(A, B, C, D, E) \ + ((__m512h) __builtin_ia32_movaps512_mask ( \ + (__v16sf) \ + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) (A), \ + (__v32hf) (C), \ + (__v32hf) (D), \ + (B), (E)), \ + (__v16sf) (A), (B))); + +#define _mm512_mask3_fmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfmaddcph512_mask_round ((A), (B), (C), (D), (E)) + +#define _mm512_maskz_fmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfmaddcph512_maskz_round ((B), (C), (D), (A), (E)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]mulcph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmul_pch (__m512h __A, __m512h __B) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmul_pch (__mmask16 __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmul_pch (__m512h __A, __m512h __B) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmul_pch (__mmask16 __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmul_round_pch (__m512h __A, __m512h __B, const int __D) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_round ((__v32hf) __A, + (__v32hf) __B, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmul_round_pch (__mmask16 __A, __m512h __B, + 
__m512h __C, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmul_round_pch (__m512h __A, __m512h __B, const int __D) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmul_round_pch (__mmask16 __A, __m512h __B, + __m512h __C, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, __E); +} + +#else +#define _mm512_fcmul_round_pch(A, B, D) \ + (__m512h) __builtin_ia32_vfcmulcph512_round ((A), (B), (D)) + +#define _mm512_mask_fcmul_round_pch(A, B, C, D, E) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((C), (D), (A), (B), (E)) + +#define _mm512_maskz_fcmul_round_pch(A, B, C, E) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((B), (C), \ + (__v32hf) \ + _mm512_setzero_ph (), \ + (A), (E)) + +#define _mm512_fmul_round_pch(A, B, D) \ + (__m512h) __builtin_ia32_vfmulcph512_round ((A), (B), (D)) + +#define _mm512_mask_fmul_round_pch(A, B, C, D, E) \ + (__m512h) __builtin_ia32_vfmulcph512_mask_round ((C), (D), (A), (B), (E)) + +#define _mm512_maskz_fmul_round_pch(A, B, C, E) \ + (__m512h) __builtin_ia32_vfmulcph512_mask_round ((B), (C), \ + (__v32hf) \ + _mm512_setzero_ph (), \ + (A), (E)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]maddcsh. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ +#ifdef __AVX512VL__ + return (__m128h) __builtin_ia32_movaps128_mask ( + (__v4sf) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v4sf) __A, __B); +#else + return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, + (__v4sf) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); +#endif +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) _mm_move_ss ((__m128) __C, + (__m128) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D, + _MM_FROUND_CUR_DIRECTION)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ +#ifdef __AVX512VL__ + return (__m128h) __builtin_ia32_movaps128_mask ( + (__v4sf) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v4sf) __A, __B); +#else + return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, + (__v4sf) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION), + (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); +#endif +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) _mm_move_ss ((__m128) __C, + (__m128) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D, + _MM_FROUND_CUR_DIRECTION)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_sch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ +#ifdef __AVX512VL__ + return (__m128h) __builtin_ia32_movaps128_mask ( + (__v4sf) + 
__builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E), + (__v4sf) __A, __B); +#else + return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, + (__v4sf) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E), + (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); +#endif +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __D, const int __E) +{ + return (__m128h) _mm_move_ss ((__m128) __C, + (__m128) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D, __E)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ +#ifdef __AVX512VL__ + return (__m128h) __builtin_ia32_movaps128_mask ( + (__v4sf) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E), + (__v4sf) __A, __B); +#else + return (__m128h) __builtin_ia32_blendvps ((__v4sf) __A, + (__v4sf) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E), + (__v4sf) _mm_set_ss ((float) ((int) __B << 31))); +#endif +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __D, const int __E) +{ + return (__m128h) _mm_move_ss ((__m128) __C, + (__m128) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D, __E)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D); +} +#else +#ifdef __AVX512VL__ +#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \ + ((__m128h) __builtin_ia32_movaps128_mask ( \ + (__v4sf) \ + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E)), \ + (__v4sf) (A), (B))) + +#else +#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \ + ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \ + (__v4sf) \ + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E)), \ + (__v4sf) _mm_set_ss ((float) ((int) (B) << 31)))) +#endif + +#define _mm_mask3_fcmadd_round_sch(A, B, C, D, E) \ + ((__m128h) 
_mm_move_ss ((__m128) (C), \ + (__m128) \ + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (B), \ + (__v8hf) (C), \ + (D), (E)))) + +#define _mm_maskz_fcmadd_round_sch(A, B, C, D, E) \ + __builtin_ia32_vfcmaddcsh_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm_fcmadd_round_sch(A, B, C, D) \ + __builtin_ia32_vfcmaddcsh_round ((A), (B), (C), (D)) + +#ifdef __AVX512VL__ +#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \ + ((__m128h) __builtin_ia32_movaps128_mask ( \ + (__v4sf) \ + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E)), \ + (__v4sf) (A), (B))) + +#else +#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \ + ((__m128h) __builtin_ia32_blendvps ((__v4sf) (A), \ + (__v4sf) \ + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E)), \ + (__v4sf) _mm_set_ss ((float) ((int) (B) << 31)))) +#endif + +#define _mm_mask3_fmadd_round_sch(A, B, C, D, E) \ + ((__m128h) _mm_move_ss ((__m128) (C), \ + (__m128) \ + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (B), \ + (__v8hf) (C), \ + (D), (E)))) + +#define _mm_maskz_fmadd_round_sch(A, B, C, D, E) \ + __builtin_ia32_vfmaddcsh_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm_fmadd_round_sch(A, B, C, D) \ + __builtin_ia32_vfmaddcsh_round ((A), (B), (C), (D)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]mulcsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_sch (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_sch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_sch (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_sch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_round_sch (__m128h __A, __m128h __B, const int __D) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_mask_fcmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_round_sch (__m128h __A, __m128h __B, const int __D) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_round ((__v8hf) __A, + (__v8hf) __B, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_fcmul_round_sch(__A, __B, __D) \ + (__m128h) __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, \ + (__v8hf) __B, __D) + +#define _mm_mask_fcmul_round_sch(__A, __B, __C, __D, __E) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, \ + (__v8hf) __D, \ + (__v8hf) __A, \ + __B, __E) + +#define _mm_maskz_fcmul_round_sch(__A, __B, __C, __E) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, \ + (__v8hf) __C, \ + _mm_setzero_ph (), \ + __A, __E) + +#define _mm_fmul_round_sch(__A, __B, __D) \ + (__m128h) __builtin_ia32_vfmulcsh_round ((__v8hf) __A, \ + (__v8hf) __B, __D) + +#define _mm_mask_fmul_round_sch(__A, __B, __C, __D, __E) \ + (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, \ + (__v8hf) __D, \ + (__v8hf) __A, \ + __B, __E) + +#define _mm_maskz_fmul_round_sch(__A, __B, __C, __E) \ + (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, \ + (__v8hf) __C, \ + _mm_setzero_ph (), \ + __A, __E) + +#endif /* __OPTIMIZE__ */ + +#define _MM512_REDUCE_OP(op) \ + __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \ + __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \ + __m256h __T3 = (__T1 op __T2); \ + __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \ + __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \ + __m128h __T6 = (__T4 op __T5); \ + __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T8 = (__T6 op __T7); \ + __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \ + (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T10 = __T8 op __T9; \ + return __T10[0] op __T10[1] + +// TODO reduce +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_ph (__m512h __A) +{ + _MM512_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_ph (__m512h __A) +{ + _MM512_REDUCE_OP (*); +} + +#undef _MM512_REDUCE_OP + +#ifdef __AVX512VL__ + +#define _MM512_REDUCE_OP(op) \ + __m256h __T1 = (__m256h) 
_mm512_extractf64x4_pd ((__m512d) __A, 0); \ + __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \ + __m256h __T3 = __builtin_ia32_##op##ph256_mask (__T1, __T2, \ + _mm256_setzero_ph (), (__mmask16) -1); \ + __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \ + __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \ + __m128h __T6 = __builtin_ia32_##op##ph128_mask \ + (__T4, __T5, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T8 = (__m128h) __builtin_ia32_##op##ph128_mask \ + (__T6, __T7, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \ + (__v8hi) { 4, 5 }); \ + __m128h __T10 = __builtin_ia32_##op##ph128_mask \ + (__T8, __T9, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T11 = (__m128h) __builtin_shuffle (__T10, \ + (__v8hi) { 1, 0 }); \ + __m128h __T12 = __builtin_ia32_##op##ph128_mask \ + (__T10, __T11, _mm_setzero_ph (),(__mmask8) -1); \ + return __T12[0] + +#else + +#define _MM512_REDUCE_OP(op) \ + __m512h __T1 = (__m512h) __builtin_shuffle ((__m512d) __A, \ + (__v8di) { 4, 5, 6, 7, 0, 0, 0, 0 }); \ + __m512h __T2 = _mm512_##op##_ph (__A, __T1); \ + __m512h __T3 = (__m512h) __builtin_shuffle ((__m512d) __T2, \ + (__v8di) { 2, 3, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T4 = _mm512_##op##_ph (__T2, __T3); \ + __m512h __T5 = (__m512h) __builtin_shuffle ((__m512d) __T4, \ + (__v8di) { 1, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T6 = _mm512_##op##_ph (__T4, __T5); \ + __m512h __T7 = (__m512h) __builtin_shuffle ((__m512) __T6, \ + (__v16si) { 1, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T8 = _mm512_##op##_ph (__T6, __T7); \ + __m512h __T9 = (__m512h) __builtin_shuffle (__T8, \ + (__v32hi) { 1, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T10 = _mm512_##op##_ph (__T8, __T9); \ + return __T10[0] +#endif + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_ph (__m512h __A) +{ + _MM512_REDUCE_OP (min); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_ph (__m512h __A) +{ + _MM512_REDUCE_OP (max); +} + +#undef _MM512_REDUCE_OP + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_ph (__mmask32 __U, __m512h __A, __m512h __W) +{ + return (__m512h) __builtin_ia32_movdquhi512_mask ((__v32hi) __W, + (__v32hi) __A, + (__mmask32) __U); + +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_ph (__m512h __A, __m512i __I, __m512h __B) +{ + return (__m512h) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A, + (__v32hi) __I, + (__v32hi) __B, + (__mmask32)-1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_ph (__m512i __A, __m512h __B) +{ + return (__m512h) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + (_mm512_setzero_ph ()), + (__mmask32)-1); +} + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h index 1292c02..3d3de96 100644 --- a/gcc/config/i386/avx512fp16vlintrin.h +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -151,6 +151,59 @@ 
_mm256_zextph128_ph256 (__m128h __A) (__m128) __A, 0); } +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conj_pch (__m256h __A) +{ + return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_set1_epi32 (1<<31)); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conj_pch (__m256h __W, __mmask8 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf) + _mm256_conj_pch (__A), + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conj_pch (__mmask8 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf) + _mm256_conj_pch (__A), + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conj_pch (__m128h __A) +{ + return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_set1_epi32 (1<<31)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conj_pch (__m128h __W, __mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A), + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conj_pch (__mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A), + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + /* Intrinsics v[add,sub,mul,div]ph. */ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -425,6 +478,22 @@ _mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C) _mm256_setzero_ph (), __A); } +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_ph (__m128h __A) +{ + return (__m128h) _mm_and_si128 ( _mm_set1_epi32 (0x7FFF7FFF), + (__m128i) __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_ph (__m256h __A) +{ + return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32 (0x7FFF7FFF), + (__m256i) __A); +} + /* vcmpph */ #ifdef __OPTIMIZE extern __inline __mmask8 @@ -2815,6 +2884,437 @@ _mm_maskz_fnmsub_ph (__mmask8 __U, __m128h __A, __m128h __B, __U); } +/* Intrinsics vf[,c]maddcph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_pch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmaddcph128 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_movaps128_mask + ((__v4sf) + __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B), + (__v4sf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfmaddcph128_maskz ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmadd_pch (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmaddcph256 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_movaps256_mask + ((__v8sf) + __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A, + (__v16hf) __C, + (__v16hf) __D, __B), + (__v8sf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D) +{ + return (__m256h) __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D) +{ + return (__m256h)__builtin_ia32_vfmaddcph256_maskz ((__v16hf) __B, + (__v16hf) __C, + (__v16hf) __D, __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfcmaddcph128 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_movaps128_mask + ((__v4sf) + __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B), + (__v4sf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h)__builtin_ia32_vfcmaddcph128_maskz ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, __A); +} + +extern __inline __m256h 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfcmaddcph256 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fcmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_movaps256_mask + ((__v8sf) + __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A, + (__v16hf) __C, + (__v16hf) __D, __B), + (__v8sf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D) +{ + return (__m256h) __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fcmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfcmaddcph256_maskz ((__v16hf) __B, + (__v16hf) __C, + (__v16hf) __D, __A); +} + +/* Intrinsics vf[,c]mulcph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_pch (__m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmulcph128 ((__v8hf) __A, (__v8hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_pch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmul_pch (__m256h __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_vfmulcph256 ((__v16hf) __A, + (__v16hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __C, + (__v16hf) __D, + (__v16hf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmul_pch (__mmask8 __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __B, + (__v16hf) __C, + _mm256_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_pch (__m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfcmulcph128 ((__v8hf) __A, + (__v8hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_pch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A); +} + 
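
A minimal usage sketch for the packed complex-half intrinsics defined above. Each __m128h is treated as four complex _Float16 values, with real parts in the even lanes and imaginary parts in the odd lanes; per the Intel intrinsics documentation, _mm_fcmadd_pch (a, b, c) accumulates a * conj(b) into c. The unaligned load/store helpers _mm_loadu_ph and _mm_storeu_ph are assumed to come from the wider AVX512FP16 header set, not from this hunk.

#include <immintrin.h>

/* acc[i] += a[i] * conj(b[i]) for interleaved complex _Float16 data,
   four complex values (eight halves) per iteration.
   Build with: gcc -O2 -mavx512fp16 -mavx512vl  */
void
cdotc_accum (_Float16 *acc, const _Float16 *a, const _Float16 *b, int ncomplex)
{
  for (int i = 0; i + 4 <= ncomplex; i += 4)
    {
      __m128h va = _mm_loadu_ph (a + 2 * i);
      __m128h vb = _mm_loadu_ph (b + 2 * i);
      __m128h vc = _mm_loadu_ph (acc + 2 * i);
      /* vfcmaddcph: complex multiply va by the conjugate of vb, add vc.  */
      vc = _mm_fcmadd_pch (va, vb, vc);
      _mm_storeu_ph (acc + 2 * i, vc);
    }
}
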
+extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fcmul_pch (__m256h __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_vfcmulcph256 ((__v16hf) __A, (__v16hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fcmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __C, + (__v16hf) __D, + (__v16hf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __B, + (__v16hf) __C, + _mm256_setzero_ph (), + __A); +} + +#define _MM256_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \ + __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \ + __m128h __T3 = (__T1 op __T2); \ + __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T5 = (__T3) op (__T4); \ + __m128h __T6 = (__m128h) __builtin_shuffle (__T5, \ + (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T7 = __T5 op __T6; \ + return __T7[0] op __T7[1] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_add_ph (__m256h __A) +{ + _MM256_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_mul_ph (__m256h __A) +{ + _MM256_REDUCE_OP (*); +} + +#undef _MM256_REDUCE_OP +#define _MM256_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \ + __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \ + __m128h __T3 = _mm_##op (__T1, __T2); \ + __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T5 = _mm_##op (__T3, __T4); \ + __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); \ + __m128h __T7 = _mm_##op (__T5, __T6); \ + __m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); \ + __m128h __T9 = _mm_##op (__T7, __T8); \ + return __T9[0] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_min_ph (__m256h __A) +{ + _MM256_REDUCE_OP (min_ph); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_max_ph (__m256h __A) +{ + _MM256_REDUCE_OP (max_ph); +} + +#define _MM_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) __builtin_shuffle (__A, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T2 = (__A) op (__T1); \ + __m128h __T3 = (__m128h) __builtin_shuffle (__T2, \ + (__v8hi){ 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T4 = __T2 op __T3; \ + return __T4[0] op __T4[1] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_add_ph (__m128h __A) +{ + _MM_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_mul_ph (__m128h __A) +{ + _MM_REDUCE_OP (*); +} + +#undef _MM_REDUCE_OP +#define _MM_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) __builtin_shuffle (__A, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T2 = _mm_##op (__A, __T1); \ + __m128h __T3 = (__m128h) __builtin_shuffle (__T2, (__v8hi){ 4, 5 }); \ + __m128h __T4 = _mm_##op (__T2, __T3); \ + __m128h __T5 = (__m128h) 
__builtin_shuffle (__T4, (__v8hi){ 1, 0 }); \ + __m128h __T6 = _mm_##op (__T4, __T5); \ + return __T6[0] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_min_ph (__m128h __A) +{ + _MM_REDUCE_OP (min_ph); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_max_ph (__m128h __A) +{ + _MM_REDUCE_OP (max_ph); +} + +#undef _MM256_REDUCE_OP +#undef _MM_REDUCE_OP + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_ph (__mmask16 __U, __m256h __A, __m256h __W) +{ + return (__m256h) __builtin_ia32_movdquhi256_mask ((__v16hi) __W, + (__v16hi) __A, + (__mmask16) __U); + +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_ph (__m256h __A, __m256i __I, __m256h __B) +{ + return (__m256h) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A, + (__v16hi) __I, + (__v16hi) __B, + (__mmask16)-1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_ph (__m256i __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, + (__v16hi) __A, + (__v16hi) + (_mm256_setzero_ph ()), + (__mmask16)-1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_ph (__mmask8 __U, __m128h __A, __m128h __W) +{ + return (__m128h) __builtin_ia32_movdquhi128_mask ((__v8hi) __W, + (__v8hi) __A, + (__mmask8) __U); + +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_ph (__m128h __A, __m128i __I, __m128h __B) +{ + return (__m128h) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A, + (__v8hi) __I, + (__v8hi) __B, + (__mmask8)-1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutexvar_ph (__m128i __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, + (__v8hi) __A, + (__v8hi) + (_mm_setzero_ph ()), + (__mmask8)-1); +} + #ifdef __DISABLE_AVX512FP16VL__ #undef __DISABLE_AVX512FP16VL__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 5eae4d0..4c355c5 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1348,6 +1348,7 @@ DEF_FUNCTION_TYPE (V8DI, V8HF, V8DI, UQI, INT) DEF_FUNCTION_TYPE (V8DF, V8HF, V8DF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V8DI, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V8DF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, V8HF, UQI, INT) @@ -1358,12 +1359,14 @@ DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF) DEF_FUNCTION_TYPE (V16HI, V16HF, V16HI, UHI) DEF_FUNCTION_TYPE (V16HF, V16HI, V16HF, UHI) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, UHI) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF) DEF_FUNCTION_TYPE (V16SI, V16HF, V16SI, UHI, INT) DEF_FUNCTION_TYPE (V16SF, V16HF, V16SF, UHI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, INT, V16HF, UHI) DEF_FUNCTION_TYPE (UHI, V16HF, V16HF, INT, UHI) DEF_FUNCTION_TYPE (V16HF, V16SI, V16HF, UHI, INT) DEF_FUNCTION_TYPE (V16HF, V16SF, V16HF, UHI, INT) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UQI) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) DEF_FUNCTION_TYPE (V32HF, 
V32HF, V32HF, USI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) @@ -1371,7 +1374,9 @@ DEF_FUNCTION_TYPE (V32HI, V32HF, V32HI, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HI, V32HF, USI, INT) DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI, INT) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, UHI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 5950d5e..302e1bc 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2911,6 +2911,26 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_mask, "__builtin_ia32_vfnmsubph128_mask", IX86_BUILTIN_VFNMSUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_mask3, "__builtin_ia32_vfnmsubph128_mask3", IX86_BUILTIN_VFNMSUBPH128_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fnmsub_v8hf_maskz, "__builtin_ia32_vfnmsubph128_maskz", IX86_BUILTIN_VFNMSUBPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v8hf, "__builtin_ia32_vfmaddcph128", IX86_BUILTIN_VFMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_mask, "__builtin_ia32_vfmaddcph128_mask", IX86_BUILTIN_VFMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddc_v8hf_maskz, "__builtin_ia32_vfmaddcph128_maskz", IX86_BUILTIN_VFMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v16hf, "__builtin_ia32_vfmaddcph256", IX86_BUILTIN_VFMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_mask, "__builtin_ia32_vfmaddcph256_mask", IX86_BUILTIN_VFMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmaddc_v16hf_maskz, "__builtin_ia32_vfmaddcph256_maskz", IX86_BUILTIN_VFMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v8hf, "__builtin_ia32_vfcmaddcph128", IX86_BUILTIN_VFCMADDCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_mask, "__builtin_ia32_vfcmaddcph128_mask", IX86_BUILTIN_VFCMADDCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddc_v8hf_maskz, "__builtin_ia32_vfcmaddcph128_maskz", IX86_BUILTIN_VFCMADDCPH128_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v16hf, 
"__builtin_ia32_vfcmaddcph256", IX86_BUILTIN_VFCMADDCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_mask, "__builtin_ia32_vfcmaddcph256_mask", IX86_BUILTIN_VFCMADDCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmaddc_v16hf_maskz, "__builtin_ia32_vfcmaddcph256_maskz", IX86_BUILTIN_VFCMADDCPH256_MASKZ, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf, "__builtin_ia32_vfcmulcph128", IX86_BUILTIN_VFCMULCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulc_v8hf_mask, "__builtin_ia32_vfcmulcph128_mask", IX86_BUILTIN_VFCMULCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmulc_v16hf, "__builtin_ia32_vfcmulcph256", IX86_BUILTIN_VFCMULCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fcmulc_v16hf_mask, "__builtin_ia32_vfcmulcph256_mask", IX86_BUILTIN_VFCMULCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulc_v8hf, "__builtin_ia32_vfmulcph128", IX86_BUILTIN_VFMULCPH_V8HF, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulc_v8hf_mask, "__builtin_ia32_vfmulcph128_mask", IX86_BUILTIN_VFMULCPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmulc_v16hf, "__builtin_ia32_vfmulcph256", IX86_BUILTIN_VFMULCPH_V16HF, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_fmulc_v16hf_mask, "__builtin_ia32_vfmulcph256_mask", IX86_BUILTIN_VFMULCPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UQI) /* Builtins with rounding support. 
*/ BDESC_END (ARGS, ROUND_ARGS) @@ -3201,6 +3221,26 @@ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_mask_round BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_mask3_round, "__builtin_ia32_vfnmaddsh3_mask3", IX86_BUILTIN_VFNMADDSH3_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfnmadd_v8hf_maskz_round, "__builtin_ia32_vfnmaddsh3_maskz", IX86_BUILTIN_VFNMADDSH3_MASKZ, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmfmsub_v8hf_mask3_round, "__builtin_ia32_vfmsubsh3_mask3", IX86_BUILTIN_VFMSUBSH3_MASK3, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fmaddc_v32hf_round, "__builtin_ia32_vfmaddcph512_round", IX86_BUILTIN_VFMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_mask_round, "__builtin_ia32_vfmaddcph512_mask_round", IX86_BUILTIN_VFMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmaddc_v32hf_maskz_round, "__builtin_ia32_vfmaddcph512_maskz_round", IX86_BUILTIN_VFMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_fma_fcmaddc_v32hf_round, "__builtin_ia32_vfcmaddcph512_round", IX86_BUILTIN_VFCMADDCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_mask_round, "__builtin_ia32_vfcmaddcph512_mask_round", IX86_BUILTIN_VFCMADDCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmaddc_v32hf_maskz_round, "__builtin_ia32_vfcmaddcph512_maskz_round", IX86_BUILTIN_VFCMADDCPH512_MASKZ_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_round, "__builtin_ia32_vfcmulcph512_round", IX86_BUILTIN_VFCMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fcmulc_v32hf_mask_round, "__builtin_ia32_vfcmulcph512_mask_round", IX86_BUILTIN_VFCMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_round, "__builtin_ia32_vfmulcph512_round", IX86_BUILTIN_VFMULCPH512_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_fmulc_v32hf_mask_round, "__builtin_ia32_vfmulcph512_mask_round", IX86_BUILTIN_VFMULCPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fcmaddcsh_v8hf_round, "__builtin_ia32_vfcmaddcsh_round", IX86_BUILTIN_VFCMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask_round, "__builtin_ia32_vfcmaddcsh_mask_round", IX86_BUILTIN_VFCMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfcmaddcsh_maskz_round", IX86_BUILTIN_VFCMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fma_fmaddcsh_v8hf_round, 
"__builtin_ia32_vfmaddcsh_round", IX86_BUILTIN_VFMADDCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask_round, "__builtin_ia32_vfmaddcsh_mask_round", IX86_BUILTIN_VFMADDCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmaddcsh_v8hf_maskz_round, "__builtin_ia32_vfmaddcsh_maskz_round", IX86_BUILTIN_VFMADDCSH_MASKZ_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_round, "__builtin_ia32_vfcmulcsh_round", IX86_BUILTIN_VFCMULCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fcmulcsh_v8hf_mask_round, "__builtin_ia32_vfcmulcsh_mask_round", IX86_BUILTIN_VFCMULCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulcsh_v8hf_round, "__builtin_ia32_vfmulcsh_round", IX86_BUILTIN_VFMULCSH_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fmulcsh_v8hf_mask_round, "__builtin_ia32_vfmulcsh_mask_round", IX86_BUILTIN_VFMULCSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) BDESC_END (ROUND_ARGS, MULTI_ARG) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index c88cb14..4780b99 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -3638,6 +3638,8 @@ ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode, return false; else if (vector_size == 64) return true; + else if (GET_MODE_INNER (cmp_mode) == HFmode) + return true; /* When op_true is NULL, op_false must be NULL, or vice versa. 
*/ gcc_assert (!op_true == !op_false); @@ -9762,6 +9764,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DI_FTYPE_V8HF_V2DI_UQI: case V2DI_FTYPE_V4SF_V2DI_UQI: case V8HF_FTYPE_V8HF_V8HF_UQI: + case V8HF_FTYPE_V8HF_V8HF_V8HF: case V8HF_FTYPE_V8HI_V8HF_UQI: case V8HF_FTYPE_V8SI_V8HF_UQI: case V8HF_FTYPE_V8SF_V8HF_UQI: @@ -9840,6 +9843,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16SF_FTYPE_V8SF_V16SF_UHI: case V16SI_FTYPE_V8SI_V16SI_UHI: case V16HF_FTYPE_V16HI_V16HF_UHI: + case V16HF_FTYPE_V16HF_V16HF_V16HF: case V16HI_FTYPE_V16HF_V16HI_UHI: case V16HI_FTYPE_V16HI_V16HI_UHI: case V8HI_FTYPE_V16QI_V8HI_UQI: @@ -9996,6 +10000,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: + case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI: case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI: case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: @@ -10725,6 +10730,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V16SF_FTYPE_V16HF_V16SF_UHI_INT: case V32HF_FTYPE_V32HI_V32HF_USI_INT: case V32HF_FTYPE_V32HF_V32HF_USI_INT: + case V32HF_FTYPE_V32HF_V32HF_V32HF_INT: case V16SF_FTYPE_V16SF_V16SF_HI_INT: case V8DI_FTYPE_V8SF_V8DI_QI_INT: case V16SF_FTYPE_V16SI_V16SF_HI_INT: @@ -10754,6 +10760,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT: case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: + case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT: case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT: case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: @@ -16038,6 +16045,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V16QImode: case E_V8HImode: + case E_V8HFmode: case E_V4SImode: case E_V2DImode: d = gen_reg_rtx (V1TImode); @@ -16059,6 +16067,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V32QImode: case E_V16HImode: + case E_V16HFmode: case E_V8SImode: case E_V4DImode: if (i == 256) @@ -16078,6 +16087,7 @@ emit_reduc_half (rtx dest, rtx src, int i) break; case E_V64QImode: case E_V32HImode: + case E_V32HFmode: if (i < 64) { d = gen_reg_rtx (V4TImode); diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index 14f816f..43bb676 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -2258,15 +2258,22 @@ remove_partial_avx_dependency (void) rtx zero; machine_mode dest_vecmode; - if (dest_mode == E_SFmode) + switch (dest_mode) { + case E_HFmode: + dest_vecmode = V8HFmode; + zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0); + break; + case E_SFmode: dest_vecmode = V4SFmode; zero = v4sf_const0; - } - else - { + break; + case E_DFmode: dest_vecmode = V2DFmode; zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); + break; + default: + gcc_unreachable (); } /* Change source to vector mode. 
*/ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ba89e11..a566d84 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2462,6 +2462,8 @@ classify_argument (machine_mode mode, const_tree type, case E_V2SFmode: case E_V2SImode: case E_V4HImode: + case E_V4HFmode: + case E_V2HFmode: case E_V8QImode: classes[0] = X86_64_SSE_CLASS; return 1; @@ -2902,6 +2904,7 @@ pass_in_reg: case E_V8QImode: case E_V4HImode: + case E_V4HFmode: case E_V2SImode: case E_V2SFmode: case E_V1TImode: @@ -3149,6 +3152,7 @@ pass_in_reg: case E_V8QImode: case E_V4HImode: + case E_V4HFmode: case E_V2SImode: case E_V2SFmode: case E_V1TImode: @@ -5035,7 +5039,8 @@ standard_80387_constant_p (rtx x) /* For XFmode constants, try to find a special 80387 instruction when optimizing for size or on those CPUs that benefit from them. */ if (mode == XFmode - && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) + && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS) + && !flag_rounding_math) { int i; @@ -10703,24 +10708,19 @@ legitimate_pic_address_disp_p (rtx disp) if (is_imported_p (op0)) return true; - if (SYMBOL_REF_FAR_ADDR_P (op0) - || !SYMBOL_REF_LOCAL_P (op0)) + if (SYMBOL_REF_FAR_ADDR_P (op0) || !SYMBOL_REF_LOCAL_P (op0)) break; - /* Function-symbols need to be resolved only for - large-model. - For the small-model we don't need to resolve anything - here. */ + /* Non-external-weak function symbols need to be resolved only + for the large model. Non-external symbols don't need to be + resolved for large and medium models. For the small model, + we don't need to resolve anything here. */ if ((ix86_cmodel != CM_LARGE_PIC - && SYMBOL_REF_FUNCTION_P (op0)) + && SYMBOL_REF_FUNCTION_P (op0) + && !(SYMBOL_REF_EXTERNAL_P (op0) && SYMBOL_REF_WEAK (op0))) + || !SYMBOL_REF_EXTERNAL_P (op0) || ix86_cmodel == CM_SMALL_PIC) return true; - /* Non-external symbols don't need to be resolved for - large, and medium-model. */ - if ((ix86_cmodel == CM_LARGE_PIC - || ix86_cmodel == CM_MEDIUM_PIC) - && !SYMBOL_REF_EXTERNAL_P (op0)) - return true; } else if (!SYMBOL_REF_FAR_ADDR_P (op0) && (SYMBOL_REF_LOCAL_P (op0) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 8a4251b..cba6d83 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1033,7 +1033,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == TImode) #define VALID_AVX512FP16_REG_MODE(MODE) \ - ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode) + ((MODE) == V8HFmode || (MODE) == V16HFmode || (MODE) == V32HFmode \ + || (MODE) == V2HFmode) #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ @@ -1041,7 +1042,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == V2DImode || (MODE) == DFmode || (MODE) == HFmode) #define VALID_SSE2_REG_VHF_MODE(MODE) \ - (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode) + (VALID_SSE2_REG_MODE (MODE) || (MODE) == V8HFmode \ + || (MODE) == V4HFmode || (MODE) == V2HFmode) #define VALID_SSE_REG_MODE(MODE) \ ((MODE) == V1TImode || (MODE) == TImode \ @@ -1051,10 +1053,12 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #define VALID_MMX_REG_MODE_3DNOW(MODE) \ ((MODE) == V2SFmode || (MODE) == SFmode) +/* To match ia32 psABI, V4HFmode should be added here. 
*/ #define VALID_MMX_REG_MODE(MODE) \ ((MODE) == V1DImode || (MODE) == DImode \ || (MODE) == V2SImode || (MODE) == SImode \ - || (MODE) == V4HImode || (MODE) == V8QImode) + || (MODE) == V4HImode || (MODE) == V8QImode \ + || (MODE) == V4HFmode) #define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode) @@ -1087,7 +1091,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode \ || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode \ || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode \ - || (MODE) == V16SFmode || VALID_AVX512FP16_REG_MODE (MODE)) + || (MODE) == V16SFmode || (MODE) == V32HFmode || (MODE) == V16HFmode \ + || (MODE) == V8HFmode) #define X87_FLOAT_MODE_P(MODE) \ (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode)) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 188f431..04cb3bf 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -498,7 +498,7 @@ ;; Main data type used by the insn (define_attr "mode" "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,SF,DF,XF,TF,V32HF,V16HF,V8HF, - V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF" + V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V2HF" (const_string "unknown")) ;; The CPU unit operations uses. @@ -832,7 +832,7 @@ x64_avx,x64_avx512bw,x64_avx512dq, sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx, avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f, - avx512bw,noavx512bw,avx512dq,noavx512dq, + avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl, avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16" (const_string "base")) @@ -874,6 +874,8 @@ (eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI2") (eq_attr "isa" "fma4") (symbol_ref "TARGET_FMA4") (eq_attr "isa" "fma") (symbol_ref "TARGET_FMA") + (eq_attr "isa" "fma_or_avx512vl") + (symbol_ref "TARGET_FMA || TARGET_AVX512VL") (eq_attr "isa" "avx512f") (symbol_ref "TARGET_AVX512F") (eq_attr "isa" "noavx512f") (symbol_ref "!TARGET_AVX512F") (eq_attr "isa" "avx512bw") (symbol_ref "TARGET_AVX512BW") @@ -1104,7 +1106,8 @@ (V1TI "16") (V2TI "32") (V4TI "64") (V2DF "16") (V4DF "32") (V8DF "64") (V4SF "16") (V8SF "32") (V16SF "64") - (V8HF "16") (V16HF "32") (V32HF "64")]) + (V8HF "16") (V16HF "32") (V32HF "64") + (V4HF "8") (V2HF "4")]) ;; Double word integer modes as mode attribute. (define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI") (TI "OI")]) @@ -1541,6 +1544,21 @@ DONE; }) +(define_expand "cstorehf4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:HF 2 "cmp_fp_expander_operand") + (match_operand:HF 3 "cmp_fp_expander_operand"))) + (set (match_operand:QI 0 "register_operand") + (match_operator 1 "ix86_fp_comparison_operator" + [(reg:CC FLAGS_REG) + (const_int 0)]))] + "TARGET_AVX512FP16" +{ + ix86_expand_setcc (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); + DONE; +}) + (define_expand "cstore<mode>4" [(set (reg:CC FLAGS_REG) (compare:CC (match_operand:MODEF 2 "cmp_fp_expander_operand") @@ -4793,6 +4811,16 @@ } }) +(define_insn "fix<fixunssuffix>_trunchf<mode>2" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (any_fix:SWI48 + (match_operand:HF 1 "nonimmediate_operand" "vm")))] + "TARGET_AVX512FP16" + "vcvttsh2<fixsuffix>si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + ;; Signed conversion to SImode. 
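
The fix<fixunssuffix>_trunchf<mode>2 pattern added above lets plain C conversions from _Float16 to integer types map onto vcvttsh2si/vcvttsh2usi. A small illustration, assuming the scalar forms only require -mavx512fp16:

/* With -O2 -mavx512fp16, each cast below can be emitted as a single
   vcvttsh2si / vcvttsh2usi truncating conversion (illustrative; the exact
   code generated depends on the surrounding function).  */
int
hf_to_int (_Float16 x)
{
  return (int) x;
}

unsigned long long
hf_to_u64 (_Float16 x)
{
  return (unsigned long long) x;  /* 64-bit form needs a 64-bit target.  */
}
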
(define_expand "fix_truncxfsi2" @@ -4900,6 +4928,17 @@ (set_attr "prefix" "evex") (set_attr "mode" "SI")]) +(define_insn "*fixuns_trunchfsi2zext" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (unsigned_fix:SI + (match_operand:HF 1 "nonimmediate_operand" "vm"))))] + "TARGET_64BIT && TARGET_AVX512FP16" + "vcvttsh2usi\t{%1, %k0|%k0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "SI")]) + (define_insn "*fixuns_trunc<mode>si2_avx512f_zext" [(set (match_operand:DI 0 "register_operand" "=r") (zero_extend:DI @@ -4932,6 +4971,14 @@ ;; Without these patterns, we'll try the unsigned SI conversion which ;; is complex for SSE, rather than the signed SI conversion, which isn't. +(define_expand "fixuns_trunchfhi2" + [(set (match_dup 2) + (fix:SI (match_operand:HF 1 "nonimmediate_operand"))) + (set (match_operand:HI 0 "nonimmediate_operand") + (subreg:HI (match_dup 2) 0))] + "TARGET_AVX512FP16" + "operands[2] = gen_reg_rtx (SImode);") + (define_expand "fixuns_trunc<mode>hi2" [(set (match_dup 2) (fix:SI (match_operand:MODEF 1 "nonimmediate_operand"))) @@ -10163,6 +10210,40 @@ [(set_attr "type" "alu") (set_attr "mode" "<MODE>")]) +;; convert (sign_extend:WIDE (any_logic:NARROW (memory, immediate))) +;; to (any_logic:WIDE (sign_extend (memory)), (sign_extend (immediate))). +;; This eliminates sign extension after logic operation. + +(define_split + [(set (match_operand:SWI248 0 "register_operand") + (sign_extend:SWI248 + (any_logic:QI (match_operand:QI 1 "memory_operand") + (match_operand:QI 2 "const_int_operand"))))] + "" + [(set (match_dup 3) (sign_extend:SWI248 (match_dup 1))) + (set (match_dup 0) (any_logic:SWI248 (match_dup 3) (match_dup 2)))] + "operands[3] = gen_reg_rtx (<MODE>mode);") + +(define_split + [(set (match_operand:SWI48 0 "register_operand") + (sign_extend:SWI48 + (any_logic:HI (match_operand:HI 1 "memory_operand") + (match_operand:HI 2 "const_int_operand"))))] + "" + [(set (match_dup 3) (sign_extend:SWI48 (match_dup 1))) + (set (match_dup 0) (any_logic:SWI48 (match_dup 3) (match_dup 2)))] + "operands[3] = gen_reg_rtx (<MODE>mode);") + +(define_split + [(set (match_operand:DI 0 "register_operand") + (sign_extend:DI + (any_logic:SI (match_operand:SI 1 "memory_operand") + (match_operand:SI 2 "const_int_operand"))))] + "TARGET_64BIT" + [(set (match_dup 3) (sign_extend:DI (match_dup 1))) + (set (match_dup 0) (any_logic:DI (match_dup 3) (match_dup 2)))] + "operands[3] = gen_reg_rtx (DImode);") + (define_insn "*<code><mode>_2" [(set (reg FLAGS_REG) (compare (any_or:SWI @@ -17041,6 +17122,19 @@ DONE; }) +(define_insn "sqrthf2" + [(set (match_operand:HF 0 "register_operand" "=v,v") + (sqrt:HF + (match_operand:HF 1 "nonimmediate_operand" "v,m")))] + "TARGET_AVX512FP16" + "@ + vsqrtsh\t{%d1, %0|%0, %d1} + vsqrtsh\t{%1, %d0|%d0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "avx_partial_xmm_update" "false,true") + (set_attr "mode" "HF")]) + (define_insn "*sqrt<mode>2_sse" [(set (match_operand:MODEF 0 "register_operand" "=v,v,v") (sqrt:MODEF @@ -18220,9 +18314,9 @@ (define_insn "sse4_1_round<mode>2" - [(set (match_operand:MODEF 0 "register_operand" "=x,x,x,v,v") - (unspec:MODEF - [(match_operand:MODEF 1 "nonimmediate_operand" "0,x,m,v,m") + [(set (match_operand:MODEFH 0 "register_operand" "=x,x,x,v,v") + (unspec:MODEFH + [(match_operand:MODEFH 1 "nonimmediate_operand" "0,x,m,v,m") (match_operand:SI 2 "const_0_to_15_operand" "n,n,n,n,n")] UNSPEC_ROUND))] "TARGET_SSE4_1" @@ -18257,6 +18351,17 @@ (set_attr 
"znver1_decode" "vector") (set_attr "mode" "XF")]) +(define_expand "rinthf2" + [(match_operand:HF 0 "register_operand") + (match_operand:HF 1 "nonimmediate_operand")] + "TARGET_AVX512FP16" +{ + emit_insn (gen_sse4_1_roundhf2 (operands[0], + operands[1], + GEN_INT (ROUND_MXCSR))); + DONE; +}) + (define_expand "rint<mode>2" [(use (match_operand:MODEF 0 "register_operand")) (use (match_operand:MODEF 1 "nonimmediate_operand"))] @@ -18290,6 +18395,17 @@ "TARGET_USE_FANCY_MATH_387 && !flag_trapping_math") +(define_expand "nearbyinthf2" + [(match_operand:HF 0 "register_operand") + (match_operand:HF 1 "nonimmediate_operand")] + "TARGET_AVX512FP16" +{ + emit_insn (gen_sse4_1_roundhf2 (operands[0], + operands[1], + GEN_INT (ROUND_MXCSR | ROUND_NO_EXC))); + DONE; +}) + (define_expand "nearbyint<mode>2" [(use (match_operand:MODEF 0 "register_operand")) (use (match_operand:MODEF 1 "nonimmediate_operand"))] @@ -18479,6 +18595,18 @@ "TARGET_USE_FANCY_MATH_387 && (flag_fp_int_builtin_inexact || !flag_trapping_math)") +(define_expand "<rounding_insn>hf2" + [(parallel [(set (match_operand:HF 0 "register_operand") + (unspec:HF [(match_operand:HF 1 "register_operand")] + FRNDINT_ROUNDING)) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_AVX512FP16" +{ + emit_insn (gen_sse4_1_roundhf2 (operands[0], operands[1], + GEN_INT (ROUND_<ROUNDING> | ROUND_NO_EXC))); + DONE; +}) + (define_expand "<rounding_insn><mode>2" [(parallel [(set (match_operand:MODEF 0 "register_operand") (unspec:MODEF [(match_operand:MODEF 1 "register_operand")] @@ -19882,6 +20010,17 @@ (set_attr "type" "sseadd") (set_attr "mode" "<MODE>")]) +(define_insn "<code>hf3" + [(set (match_operand:HF 0 "register_operand" "=v") + (smaxmin:HF + (match_operand:HF 1 "nonimmediate_operand" "%v") + (match_operand:HF 2 "nonimmediate_operand" "vm")))] + "TARGET_AVX512FP16" + "v<maxmin_float>sh\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "type" "sseadd") + (set_attr "mode" "HF")]) + ;; These versions of the min/max patterns implement exactly the operations ;; min = (op1 < op2 ? op1 : op2) ;; max = (!(op1 < op2) ? 
op1 : op2) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 2d3b63f..c9467bc 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -48,7 +48,7 @@ (define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI (V1DI "TARGET_SSE2")]) ;; All 8-byte vector modes handled by MMX -(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF]) +(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF V4HF]) (define_mode_iterator MMXMODE124 [V8QI V4HI V2SI V2SF]) ;; Mix-n-match @@ -57,8 +57,8 @@ (define_mode_iterator MMXMODE24 [V4HI V2SI]) (define_mode_iterator MMXMODE248 [V4HI V2SI V1DI]) -;; All 4-byte integer vector modes -(define_mode_iterator V_32 [V4QI V2HI V1SI]) +;; All 4-byte integer/float16 vector modes +(define_mode_iterator V_32 [V4QI V2HI V1SI V2HF]) ;; 4-byte integer vector modes (define_mode_iterator VI_32 [V4QI V2HI]) @@ -66,6 +66,9 @@ ;; V2S* modes (define_mode_iterator V2FI [V2SF V2SI]) +;; 4-byte and 8-byte float16 vector modes +(define_mode_iterator VHF_32_64 [V4HF V2HF]) + ;; Mapping from integer vector mode to mnemonic suffix (define_mode_attr mmxvecsize [(V8QI "b") (V4QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) @@ -191,6 +194,8 @@ (eq_attr "alternative" "11,12") (cond [(match_test "<MODE>mode == V2SFmode") (const_string "V4SF") + (match_test "<MODE>mode == V4HFmode") + (const_string "V4SF") (ior (not (match_test "TARGET_SSE2")) (match_test "optimize_function_for_size_p (cfun)")) (const_string "V4SF") @@ -198,14 +203,16 @@ (const_string "TI")) (and (eq_attr "alternative" "13") - (ior (and (match_test "<MODE>mode == V2SFmode") - (not (match_test "TARGET_MMX_WITH_SSE"))) - (not (match_test "TARGET_SSE2")))) + (ior (ior (and (match_test "<MODE>mode == V2SFmode") + (not (match_test "TARGET_MMX_WITH_SSE"))) + (not (match_test "TARGET_SSE2"))) + (match_test "<MODE>mode == V4HFmode"))) (const_string "V2SF") (and (eq_attr "alternative" "14") - (ior (match_test "<MODE>mode == V2SFmode") - (not (match_test "TARGET_SSE2")))) + (ior (ior (match_test "<MODE>mode == V2SFmode") + (not (match_test "TARGET_SSE2"))) + (match_test "<MODE>mode == V4HFmode"))) (const_string "V2SF") ] (const_string "DI"))) @@ -289,12 +296,17 @@ (const_string "*"))) (set (attr "mode") (cond [(eq_attr "alternative" "2,3") - (cond [(match_test "TARGET_AVX") + (cond [(match_test "<MODE>mode == V2HFmode") + (const_string "V4SF") + (match_test "TARGET_AVX") (const_string "TI") (match_test "optimize_function_for_size_p (cfun)") (const_string "V4SF") ] (const_string "TI")) + (and (eq_attr "alternative" "4,5") + (match_test "<MODE>mode == V2HFmode")) + (const_string "SF") ] (const_string "SI"))) (set (attr "preferred_for_speed") @@ -1019,12 +1031,13 @@ (match_operand:V2SF 1 "register_operand" "%0,v,x") (match_operand:V2SF 2 "register_operand" "v,v,x") (match_operand:V2SF 3 "register_operand" "v,0,x")))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) + && TARGET_MMX_WITH_SSE" "@ vfmadd132ps\t{%2, %3, %0|%0, %3, %2} vfmadd231ps\t{%2, %1, %0|%0, %1, %2} vfmaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "isa" "fma,fma,fma4") + [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "V4SF")]) @@ -1035,12 +1048,13 @@ (match_operand:V2SF 2 "register_operand" "v,v,x") (neg:V2SF (match_operand:V2SF 3 "register_operand" "v,0,x"))))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) + && TARGET_MMX_WITH_SSE" "@ vfmsub132ps\t{%2, %3, %0|%0, %3, %2} 
vfmsub231ps\t{%2, %1, %0|%0, %1, %2} vfmsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "isa" "fma,fma,fma4") + [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "V4SF")]) @@ -1051,12 +1065,13 @@ (match_operand:V2SF 1 "register_operand" "%0,v,x")) (match_operand:V2SF 2 "register_operand" "v,v,x") (match_operand:V2SF 3 "register_operand" "v,0,x")))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) + && TARGET_MMX_WITH_SSE" "@ vfnmadd132ps\t{%2, %3, %0|%0, %3, %2} vfnmadd231ps\t{%2, %1, %0|%0, %1, %2} vfnmaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "isa" "fma,fma,fma4") + [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "V4SF")]) @@ -1068,12 +1083,13 @@ (match_operand:V2SF 2 "register_operand" "v,v,x") (neg:V2SF (match_operand:V2SF 3 "register_operand" "v,0,x"))))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_MMX_WITH_SSE" + "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL) + && TARGET_MMX_WITH_SSE" "@ vfnmsub132ps\t{%2, %3, %0|%0, %3, %2} vfnmsub231ps\t{%2, %1, %0|%0, %1, %2} vfnmsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "isa" "fma,fma,fma4") + [(set_attr "isa" "fma_or_avx512vl,fma_or_avx512vl,fma4") (set_attr "type" "ssemuladd") (set_attr "mode" "V4SF")]) @@ -1389,6 +1405,28 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; +;; Parallel half-precision floating point arithmetic +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "<insn><mode>3" + [(set (match_operand:VHF_32_64 0 "register_operand" "=v") + (plusminusmultdiv:VHF_32_64 + (match_operand:VHF_32_64 1 "register_operand" "<comm>v") + (match_operand:VHF_32_64 2 "register_operand" "v")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "v<insn>ph\t{%2, %1, %0|%0, %1, %2}" + [(set (attr "type") + (cond [(match_test "<CODE> == MULT") + (const_string "ssemul") + (match_test "<CODE> == DIV") + (const_string "ssediv")] + (const_string "sseadd"))) + (set_attr "prefix" "evex") + (set_attr "mode" "V8HF")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; Parallel integral arithmetic ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 0016c02..4559b0c 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -191,6 +191,14 @@ UNSPEC_VCVTNE2PS2BF16 UNSPEC_VCVTNEPS2BF16 UNSPEC_VDPBF16PS + + ;; For AVX512FP16 suppport + UNSPEC_COMPLEX_FMA + UNSPEC_COMPLEX_FCMA + UNSPEC_COMPLEX_FMUL + UNSPEC_COMPLEX_FCMUL + UNSPEC_COMPLEX_MASK + ]) (define_c_enum "unspecv" [ @@ -939,6 +947,10 @@ (V16SF "HI") (V8SF "QI") (V4SF "QI") (V8DF "QI") (V4DF "QI") (V2DF "QI")]) +;; Mapping of vector modes to corresponding complex mask size +(define_mode_attr avx512fmaskcmode + [(V32HF "HI") (V16HF "QI") (V8HF "QI")]) + ;; Mapping of vector modes to corresponding mask size (define_mode_attr avx512fmaskmodelower [(V64QI "di") (V32QI "si") (V16QI "hi") @@ -977,9 +989,9 @@ (V16HF "OI") (V8HF "TI")]) (define_mode_attr sseintvecmodelower - [(V16SF "v16si") (V8DF "v8di") - (V8SF "v8si") (V4DF "v4di") - (V4SF "v4si") (V2DF "v2di") + [(V32HF "v32hi") (V16SF "v16si") (V8DF "v8di") + (V16HF "v16hi") (V8SF "v8si") (V4DF "v4di") + (V8HF "v8hi") (V4SF "v4si") (V2DF "v2di") (V8SI "v8si") (V4DI "v4di") (V4SI "v4si") (V2DI "v2di") (V16HI "v16hi") (V8HI "v8hi") @@ -1022,6 +1034,13 @@ (V8DI "V8HF") (V4DI "V8HF") (V2DI 
"V8HF") (V8DF "V8HF") (V16SF "V16HF") (V8SF "V8HF")]) +;; Mapping of vector modes to vector hf modes of same element. +(define_mode_attr ssePHmodelower + [(V32HI "v32hf") (V16HI "v16hf") (V8HI "v8hf") + (V16SI "v16hf") (V8SI "v8hf") (V4SI "v4hf") + (V8DI "v8hf") (V4DI "v4hf") (V2DI "v2hf") + (V8DF "v8hf") (V16SF "v16hf") (V8SF "v8hf")]) + ;; Mapping of vector modes to packed single mode of the same size (define_mode_attr ssePSmode [(V16SI "V16SF") (V8DF "V16SF") @@ -1549,9 +1568,9 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_store<mode>_mask" - [(set (match_operand:VI12_AVX512VL 0 "memory_operand" "=m") - (vec_merge:VI12_AVX512VL - (match_operand:VI12_AVX512VL 1 "register_operand" "v") + [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") + (vec_merge:VI12HF_AVX512VL + (match_operand:VI12HF_AVX512VL 1 "register_operand" "v") (match_dup 0) (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))] "TARGET_AVX512BW" @@ -2106,12 +2125,12 @@ [(set_attr "isa" "noavx,noavx,avx,avx")]) (define_expand "cond_<insn><mode>" - [(set (match_operand:VF 0 "register_operand") - (vec_merge:VF - (plusminus:VF - (match_operand:VF 2 "vector_operand") - (match_operand:VF 3 "vector_operand")) - (match_operand:VF 4 "nonimm_or_0_operand") + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (plusminus:VFH + (match_operand:VFH 2 "vector_operand") + (match_operand:VFH 3 "vector_operand")) + (match_operand:VFH 4 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "<MODE_SIZE> == 64 || TARGET_AVX512VL" { @@ -2195,12 +2214,12 @@ (set_attr "mode" "<ssescalarmode>")]) (define_expand "cond_mul<mode>" - [(set (match_operand:VF 0 "register_operand") - (vec_merge:VF - (mult:VF - (match_operand:VF 2 "vector_operand") - (match_operand:VF 3 "vector_operand")) - (match_operand:VF 4 "nonimm_or_0_operand") + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (mult:VFH + (match_operand:VFH 2 "vector_operand") + (match_operand:VFH 3 "vector_operand")) + (match_operand:VFH 4 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "<MODE_SIZE> == 64 || TARGET_AVX512VL" { @@ -2310,12 +2329,12 @@ }) (define_expand "cond_div<mode>" - [(set (match_operand:VF 0 "register_operand") - (vec_merge:VF - (div:VF - (match_operand:VF 2 "register_operand") - (match_operand:VF 3 "vector_operand")) - (match_operand:VF 4 "nonimm_or_0_operand") + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (div:VFH + (match_operand:VFH 2 "register_operand") + (match_operand:VFH 3 "vector_operand")) + (match_operand:VFH 4 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "<MODE_SIZE> == 64 || TARGET_AVX512VL" { @@ -2510,12 +2529,12 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "*<sse>_vmsqrt<mode>2<mask_scalar_name><round_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (sqrt:<ssescalarmode> (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "xm,<round_scalar_constraint>"))) - (match_operand:VF_128 2 "register_operand" "0,v") + (match_operand:VFH_128 2 "register_operand" "0,v") (const_int 1)))] "TARGET_SSE" "@ @@ -2648,12 +2667,12 @@ (set_attr "mode" "HF")]) (define_expand "cond_<code><mode>" - [(set (match_operand:VF 0 "register_operand") - (vec_merge:VF - (smaxmin:VF - (match_operand:VF 2 "vector_operand") - 
(match_operand:VF 3 "vector_operand")) - (match_operand:VF 4 "nonimm_or_0_operand") + [(set (match_operand:VFH 0 "register_operand") + (vec_merge:VFH + (smaxmin:VFH + (match_operand:VFH 2 "vector_operand") + (match_operand:VFH 3 "vector_operand")) + (match_operand:VFH 4 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "<MODE_SIZE> == 64 || TARGET_AVX512VL" { @@ -3137,36 +3156,20 @@ (set_attr "prefix_rep" "1,*") (set_attr "mode" "V4SF")]) -(define_expand "reduc_plus_scal_v4sf" - [(plus:V4SF - (match_operand:SF 0 "register_operand") - (match_operand:V4SF 1 "register_operand"))] - "TARGET_SSE" -{ - rtx vtmp = gen_reg_rtx (V4SFmode); - rtx stmp = gen_reg_rtx (SFmode); - if (TARGET_SSE3) - emit_insn (gen_sse3_movshdup (vtmp, operands[1])); - else - emit_insn (gen_sse_shufps (vtmp, operands[1], operands[1], GEN_INT(177))); - - emit_insn (gen_addv4sf3 (operands[1], operands[1], vtmp)); - emit_insn (gen_sse_movhlps (vtmp, vtmp, operands[1])); - emit_insn (gen_vec_extractv4sfsf (stmp, vtmp, const0_rtx)); - emit_insn (gen_vec_extractv4sfsf (operands[0], operands[1], const0_rtx)); - emit_insn (gen_addsf3 (operands[0], operands[0], stmp)); - DONE; -}) +(define_mode_iterator REDUC_SSE_PLUS_MODE + [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")]) -(define_expand "reduc_plus_scal_v2df" - [(plus:V2DF - (match_operand:DF 0 "register_operand") - (match_operand:V2DF 1 "register_operand"))] - "TARGET_SSE" +(define_expand "reduc_plus_scal_<mode>" + [(plus:REDUC_SSE_PLUS_MODE + (match_operand:<ssescalarmode> 0 "register_operand") + (match_operand:REDUC_SSE_PLUS_MODE 1 "register_operand"))] + "" { - rtx tmp = gen_reg_rtx (V2DFmode); - ix86_expand_reduc (gen_addv2df3, tmp, operands[1]); - emit_insn (gen_vec_extractv2dfdf (operands[0], tmp, const0_rtx)); + rtx tmp = gen_reg_rtx (<MODE>mode); + ix86_expand_reduc (gen_add<mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><ssescalarmodelower> (operands[0], tmp, + const0_rtx)); DONE; }) @@ -3192,7 +3195,9 @@ (define_mode_iterator REDUC_PLUS_MODE [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F") + (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")]) (define_expand "reduc_plus_scal_<mode>" @@ -3212,7 +3217,8 @@ ;; Modes handled by reduc_sm{in,ax}* patterns. 
(define_mode_iterator REDUC_SSE_SMINMAX_MODE - [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE") + [(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V4SF "TARGET_SSE") (V2DF "TARGET_SSE") (V4SI "TARGET_SSE2") (V8HI "TARGET_SSE2") (V16QI "TARGET_SSE2") (V2DI "TARGET_SSE4_2")]) @@ -3231,9 +3237,11 @@ (define_mode_iterator REDUC_SMINMAX_MODE [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX") (V4DF "TARGET_AVX") (V64QI "TARGET_AVX512BW") + (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V32HI "TARGET_AVX512BW") (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) @@ -3791,8 +3799,8 @@ (define_expand "vec_cmp<mode><avx512fmaskmodelower>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (match_operator:<avx512fmaskmode> 1 "" - [(match_operand:V48_AVX512VL 2 "register_operand") - (match_operand:V48_AVX512VL 3 "nonimmediate_operand")]))] + [(match_operand:V48H_AVX512VL 2 "register_operand") + (match_operand:V48H_AVX512VL 3 "nonimmediate_operand")]))] "TARGET_AVX512F" { bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]), @@ -3999,6 +4007,51 @@ DONE; }) +(define_expand "vcond<mode><mode>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand") + (if_then_else:VF_AVX512FP16VL + (match_operator 3 "" + [(match_operand:VF_AVX512FP16VL 4 "vector_operand") + (match_operand:VF_AVX512FP16VL 5 "vector_operand")]) + (match_operand:VF_AVX512FP16VL 1 "general_operand") + (match_operand:VF_AVX512FP16VL 2 "general_operand")))] + "TARGET_AVX512FP16" +{ + bool ok = ix86_expand_fp_vcond (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcond<mode><sseintvecmodelower>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand") + (if_then_else:VF_AVX512FP16VL + (match_operator 3 "" + [(match_operand:<sseintvecmode> 4 "vector_operand") + (match_operand:<sseintvecmode> 5 "vector_operand")]) + (match_operand:VF_AVX512FP16VL 1 "general_operand") + (match_operand:VF_AVX512FP16VL 2 "general_operand")))] + "TARGET_AVX512FP16" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + +(define_expand "vcond<sseintvecmodelower><mode>" + [(set (match_operand:<sseintvecmode> 0 "register_operand") + (if_then_else:<sseintvecmode> + (match_operator 3 "" + [(match_operand:VF_AVX512FP16VL 4 "vector_operand") + (match_operand:VF_AVX512FP16VL 5 "vector_operand")]) + (match_operand:<sseintvecmode> 1 "general_operand") + (match_operand:<sseintvecmode> 2 "general_operand")))] + "TARGET_AVX512FP16" +{ + bool ok = ix86_expand_fp_vcond (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcond_mask_<mode><avx512fmaskmodelower>" [(set (match_operand:V48_AVX512VL 0 "register_operand") (vec_merge:V48_AVX512VL @@ -4008,10 +4061,10 @@ "TARGET_AVX512F") (define_expand "vcond_mask_<mode><avx512fmaskmodelower>" - [(set (match_operand:VI12_AVX512VL 0 "register_operand") - (vec_merge:VI12_AVX512VL - (match_operand:VI12_AVX512VL 1 "nonimmediate_operand") - (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand") + [(set (match_operand:VI12HF_AVX512VL 0 "register_operand") + (vec_merge:VI12HF_AVX512VL + (match_operand:VI12HF_AVX512VL 1 "nonimmediate_operand") + (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 3 "register_operand")))] "TARGET_AVX512BW") @@ -4638,7 +4691,11 @@ (V8SF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL") (V4DF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL") (V16SF 
"TARGET_AVX512F") - (V8DF "TARGET_AVX512F")]) + (V8DF "TARGET_AVX512F") + (HF "TARGET_AVX512FP16") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V32HF "TARGET_AVX512FP16")]) (define_expand "fma<mode>4" [(set (match_operand:FMAMODEM 0 "register_operand") @@ -4746,14 +4803,11 @@ (set_attr "mode" "<MODE>")]) ;; Suppose AVX-512F as baseline -(define_mode_iterator VF_SF_AVX512VL - [SF V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") - DF V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) - (define_mode_iterator VFH_SF_AVX512VL [(V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (HF "TARGET_AVX512FP16") SF V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") DF V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) @@ -4772,13 +4826,13 @@ (set_attr "mode" "<MODE>")]) (define_expand "cond_fma<mode>" - [(set (match_operand:VF_AVX512VL 0 "register_operand") - (vec_merge:VF_AVX512VL - (fma:VF_AVX512VL - (match_operand:VF_AVX512VL 2 "vector_operand") - (match_operand:VF_AVX512VL 3 "vector_operand") - (match_operand:VF_AVX512VL 4 "vector_operand")) - (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + [(set (match_operand:VFH_AVX512VL 0 "register_operand") + (vec_merge:VFH_AVX512VL + (fma:VFH_AVX512VL + (match_operand:VFH_AVX512VL 2 "vector_operand") + (match_operand:VFH_AVX512VL 3 "vector_operand") + (match_operand:VFH_AVX512VL 4 "vector_operand")) + (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "TARGET_AVX512F" { @@ -4872,14 +4926,14 @@ (set_attr "mode" "<MODE>")]) (define_expand "cond_fms<mode>" - [(set (match_operand:VF_AVX512VL 0 "register_operand") - (vec_merge:VF_AVX512VL - (fma:VF_AVX512VL - (match_operand:VF_AVX512VL 2 "vector_operand") - (match_operand:VF_AVX512VL 3 "vector_operand") - (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 4 "vector_operand"))) - (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + [(set (match_operand:VFH_AVX512VL 0 "register_operand") + (vec_merge:VFH_AVX512VL + (fma:VFH_AVX512VL + (match_operand:VFH_AVX512VL 2 "vector_operand") + (match_operand:VFH_AVX512VL 3 "vector_operand") + (neg:VFH_AVX512VL + (match_operand:VFH_AVX512VL 4 "vector_operand"))) + (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "TARGET_AVX512F" { @@ -4975,14 +5029,14 @@ (set_attr "mode" "<MODE>")]) (define_expand "cond_fnma<mode>" - [(set (match_operand:VF_AVX512VL 0 "register_operand") - (vec_merge:VF_AVX512VL - (fma:VF_AVX512VL - (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 2 "vector_operand")) - (match_operand:VF_AVX512VL 3 "vector_operand") - (match_operand:VF_AVX512VL 4 "vector_operand")) - (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + [(set (match_operand:VFH_AVX512VL 0 "register_operand") + (vec_merge:VFH_AVX512VL + (fma:VFH_AVX512VL + (neg:VFH_AVX512VL + (match_operand:VFH_AVX512VL 2 "vector_operand")) + (match_operand:VFH_AVX512VL 3 "vector_operand") + (match_operand:VFH_AVX512VL 4 "vector_operand")) + (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "TARGET_AVX512F" { @@ -5080,15 +5134,15 @@ (set_attr "mode" "<MODE>")]) (define_expand "cond_fnms<mode>" - [(set (match_operand:VF_AVX512VL 0 "register_operand") - (vec_merge:VF_AVX512VL - (fma:VF_AVX512VL - (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 2 "vector_operand")) - 
(match_operand:VF_AVX512VL 3 "vector_operand") - (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 4 "vector_operand"))) - (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + [(set (match_operand:VFH_AVX512VL 0 "register_operand") + (vec_merge:VFH_AVX512VL + (fma:VFH_AVX512VL + (neg:VFH_AVX512VL + (match_operand:VFH_AVX512VL 2 "vector_operand")) + (match_operand:VFH_AVX512VL 3 "vector_operand") + (neg:VFH_AVX512VL + (match_operand:VFH_AVX512VL 4 "vector_operand"))) + (match_operand:VFH_AVX512VL 5 "nonimm_or_0_operand") (match_operand:<avx512fmaskmode> 1 "register_operand")))] "TARGET_AVX512F" { @@ -5793,6 +5847,168 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; Complex type operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_int_iterator UNSPEC_COMPLEX_F_C_MA + [UNSPEC_COMPLEX_FMA UNSPEC_COMPLEX_FCMA]) + +(define_int_iterator UNSPEC_COMPLEX_F_C_MUL + [UNSPEC_COMPLEX_FMUL UNSPEC_COMPLEX_FCMUL]) + +(define_int_attr complexopname + [(UNSPEC_COMPLEX_FMA "fmaddc") + (UNSPEC_COMPLEX_FCMA "fcmaddc") + (UNSPEC_COMPLEX_FMUL "fmulc") + (UNSPEC_COMPLEX_FCMUL "fcmulc")]) + +(define_expand "<avx512>_fmaddc_<mode>_maskz<round_expand_name>" + [(match_operand:VF_AVX512FP16VL 0 "register_operand") + (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>") + (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>") + (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>") + (match_operand:<avx512fmaskcmode> 4 "register_operand")] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" +{ + emit_insn (gen_fma_fmaddc_<mode>_maskz_1<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_expand "<avx512>_fcmaddc_<mode>_maskz<round_expand_name>" + [(match_operand:VF_AVX512FP16VL 0 "register_operand") + (match_operand:VF_AVX512FP16VL 1 "<round_expand_nimm_predicate>") + (match_operand:VF_AVX512FP16VL 2 "<round_expand_nimm_predicate>") + (match_operand:VF_AVX512FP16VL 3 "<round_expand_nimm_predicate>") + (match_operand:<avx512fmaskcmode> 4 "register_operand")] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" +{ + emit_insn (gen_fma_fcmaddc_<mode>_maskz_1<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v") + (match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" "<round_constraint>") + (match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0")] + UNSPEC_COMPLEX_F_C_MA))] + "TARGET_AVX512FP16 && <sdc_mask_mode512bit_condition> && <round_mode512bit_condition>" + "v<complexopname><ssemodesuffix>\t{<round_sdc_mask_op4>%2, %1, %0<sdc_mask_op4>|%0<sdc_mask_op4>, %1, %2<round_sdc_mask_op4>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (vec_merge:VF_AVX512FP16VL + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") + (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>") + (match_operand:VF_AVX512FP16VL 3 
"register_operand" "0")] + UNSPEC_COMPLEX_F_C_MA) + (match_dup 1) + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)))] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" + "v<complexopname><ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") + (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")] + UNSPEC_COMPLEX_F_C_MUL))] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" + "v<complexopname><ssemodesuffix>\t{<round_maskc_op3>%2, %1, %0<maskc_operand3>|%0<maskc_operand3>, %1, %2<round_maskc_op3>}" + [(set_attr "type" "ssemul") + (set_attr "mode" "<MODE>")]) + +(define_expand "avx512fp16_fmaddcsh_v8hf_maskz<round_expand_name>" + [(match_operand:V8HF 0 "register_operand") + (match_operand:V8HF 1 "<round_expand_nimm_predicate>") + (match_operand:V8HF 2 "<round_expand_nimm_predicate>") + (match_operand:V8HF 3 "<round_expand_nimm_predicate>") + (match_operand:QI 4 "register_operand")] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" +{ + emit_insn (gen_avx512fp16_fma_fmaddcsh_v8hf_maskz<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (V8HFmode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_expand "avx512fp16_fcmaddcsh_v8hf_maskz<round_expand_name>" + [(match_operand:V8HF 0 "register_operand") + (match_operand:V8HF 1 "<round_expand_nimm_predicate>") + (match_operand:V8HF 2 "<round_expand_nimm_predicate>") + (match_operand:V8HF 3 "<round_expand_nimm_predicate>") + (match_operand:QI 4 "register_operand")] + "TARGET_AVX512FP16 && <round_mode512bit_condition>" +{ + emit_insn (gen_avx512fp16_fma_fcmaddcsh_v8hf_maskz<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (V8HFmode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_insn "avx512fp16_fma_<complexopname>sh_v8hf<mask_scalarcz_name><round_scalarcz_name>" + [(set (match_operand:V8HF 0 "register_operand" "=&v") + (vec_merge:V8HF + (unspec:V8HF + [(match_operand:V8HF 1 "<round_scalarcz_nimm_predicate>" "v") + (match_operand:V8HF 2 "<round_scalarcz_nimm_predicate>" "<round_scalarcz_constraint>") + (match_operand:V8HF 3 "<round_scalarcz_nimm_predicate>" "0")] + UNSPEC_COMPLEX_F_C_MA) + (match_dup 2) + (const_int 3)))] + "TARGET_AVX512FP16" + "v<complexopname>sh\t{<round_scalarcz_mask_op4>%2, %1, %0<mask_scalarcz_operand4>|%0<mask_scalarcz_operand4>, %1, %2<round_scalarcz_maskcz_mask_op4>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V8HF")]) + +(define_insn "avx512fp16_<complexopname>sh_v8hf_mask<round_name>" + [(set (match_operand:V8HF 0 "register_operand" "=&v") + (vec_merge:V8HF + (vec_merge:V8HF + (unspec:V8HF + [(match_operand:V8HF 1 "<round_nimm_predicate>" "v") + (match_operand:V8HF 2 "<round_nimm_predicate>" "<round_constraint>") + (match_operand:V8HF 3 "<round_nimm_predicate>" "0")] + UNSPEC_COMPLEX_F_C_MA) + (match_dup 1) + (unspec:QI [(match_operand:QI 4 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)) + (match_dup 2) + (const_int 3)))] + "TARGET_AVX512FP16" + "v<complexopname>sh\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "V8HF")]) + 
+(define_insn "avx512fp16_<complexopname>sh_v8hf<mask_scalarc_name><round_scalarcz_name>" + [(set (match_operand:V8HF 0 "register_operand" "=&v") + (vec_merge:V8HF + (unspec:V8HF + [(match_operand:V8HF 1 "nonimmediate_operand" "v") + (match_operand:V8HF 2 "<round_scalarcz_nimm_predicate>" "<round_scalarcz_constraint>")] + UNSPEC_COMPLEX_F_C_MUL) + (match_dup 1) + (const_int 3)))] + "TARGET_AVX512FP16" + "v<complexopname>sh\t{<round_scalarc_mask_op3>%2, %1, %0<mask_scalarc_operand3>|%0<mask_scalarc_operand3>, %1, %2<round_scalarc_mask_op3>}" + [(set_attr "type" "ssemul") + (set_attr "mode" "V8HF")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel half-precision floating point conversion operations @@ -5824,6 +6040,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "float<floatunssuffix><mode><ssePHmodelower>2" + [(set (match_operand:<ssePHmode> 0 "register_operand") + (any_float:<ssePHmode> + (match_operand:VI2H_AVX512VL 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_insn "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode><mask_name><round_name>" [(set (match_operand:<ssePHmode> 0 "register_operand" "=v") (any_float:<ssePHmode> @@ -5834,11 +6056,23 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>" - [(set (match_operand:V8HF 0 "register_operand" "=v") +(define_expand "float<floatunssuffix><mode>v4hf2" + [(set (match_operand:V4HF 0 "register_operand") + (any_float:V4HF + (match_operand:VI4_128_8_256 1 "vector_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + operands[0] = lowpart_subreg (V8HFmode, operands[0], V4HFmode); + emit_insn (gen_avx512fp16_float<floatunssuffix><mode>v4hf2 (operands[0], + operands[1])); + DONE; +}) + +(define_expand "avx512fp16_float<floatunssuffix><mode>v4hf2" + [(set (match_operand:V8HF 0 "register_operand") (vec_concat:V8HF - (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) - (match_dup 2)))] + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand")) + (match_dup 2)))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V4HFmode);") @@ -5897,11 +6131,23 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di" - [(set (match_operand:V8HF 0 "register_operand" "=v") +(define_expand "float<floatunssuffix>v2div2hf2" + [(set (match_operand:V2HF 0 "register_operand") + (any_float:V2HF + (match_operand:V2DI 1 "vector_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + operands[0] = lowpart_subreg (V8HFmode, operands[0], V2HFmode); + emit_insn (gen_avx512fp16_float<floatunssuffix>v2div2hf2 (operands[0], + operands[1])); + DONE; +}) + +(define_expand "avx512fp16_float<floatunssuffix>v2div2hf2" + [(set (match_operand:V8HF 0 "register_operand") (vec_concat:V8HF - (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) - (match_dup 2)))] + (any_float:V2HF (match_operand:V2DI 1 "vector_operand")) + (match_dup 2)))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V6HFmode);") @@ -6000,6 +6246,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "HF")]) +(define_expand "fix<fixunssuffix>_trunc<ssePHmodelower><mode>2" + [(set (match_operand:VI2H_AVX512VL 0 "register_operand") + (any_fix:VI2H_AVX512VL + (match_operand:<ssePHmode> 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_insn 
"avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name><round_saeonly_name>" [(set (match_operand:VI2H_AVX512VL 0 "register_operand" "=v") (any_fix:VI2H_AVX512VL @@ -6010,6 +6262,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "fix<fixunssuffix>_truncv4hf<mode>2" + [(set (match_operand:VI4_128_8_256 0 "register_operand") + (any_fix:VI4_128_8_256 + (match_operand:V4HF 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + if (!MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8HFmode, operands[1], V4HFmode); + emit_insn (gen_avx512fp16_fix<fixunssuffix>_trunc<mode>2 (operands[0], + operands[1])); + DONE; + } +}) + (define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name>" [(set (match_operand:VI4_128_8_256 0 "register_operand" "=v") (any_fix:VI4_128_8_256 @@ -6032,6 +6299,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "fix<fixunssuffix>_truncv2hfv2di2" + [(set (match_operand:V2DI 0 "register_operand") + (any_fix:V2DI + (match_operand:V2HF 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + if (!MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8HFmode, operands[1], V2HFmode); + emit_insn (gen_avx512fp16_fix<fixunssuffix>_truncv2di2 (operands[0], + operands[1])); + DONE; + } +}) + (define_insn "avx512fp16_fix<fixunssuffix>_truncv2di2<mask_name>" [(set (match_operand:V2DI 0 "register_operand" "=v") (any_fix:V2DI @@ -6080,6 +6362,12 @@ [(V16SF "x") (V8SF "x") (V4SF "x") (V8DF "") (V4DF "") (V2DF "")]) +(define_expand "extend<ssePHmodelower><mode>2" + [(set (match_operand:VF48H_AVX512VL 0 "register_operand") + (float_extend:VF48H_AVX512VL + (match_operand:<ssePHmode> 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_insn "avx512fp16_float_extend_ph<mode>2<mask_name><round_saeonly_name>" [(set (match_operand:VF48H_AVX512VL 0 "register_operand" "=v") (float_extend:VF48H_AVX512VL @@ -6090,6 +6378,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "extendv4hf<mode>2" + [(set (match_operand:VF4_128_8_256 0 "register_operand") + (float_extend:VF4_128_8_256 + (match_operand:V4HF 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + if (!MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8HFmode, operands[1], V4HFmode); + emit_insn (gen_avx512fp16_float_extend_ph<mode>2 + (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx512fp16_float_extend_ph<mode>2<mask_name>" [(set (match_operand:VF4_128_8_256 0 "register_operand" "=v") (float_extend:VF4_128_8_256 @@ -6112,6 +6415,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "extendv2hfv2df2" + [(set (match_operand:V2DF 0 "register_operand") + (float_extend:V2DF + (match_operand:V2HF 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + if (!MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8HFmode, operands[1], V2HFmode); + emit_insn (gen_avx512fp16_float_extend_phv2df2 + (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx512fp16_float_extend_phv2df2<mask_name>" [(set (match_operand:V2DF 0 "register_operand" "=v") (float_extend:V2DF @@ -6134,6 +6452,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_expand "trunc<mode><ssePHmodelower>2" + [(set (match_operand:<ssePHmode> 0 "register_operand") + (float_truncate:<ssePHmode> + (match_operand:VF48H_AVX512VL 1 "nonimmediate_operand")))] + "TARGET_AVX512FP16") + (define_insn 
"avx512fp16_vcvt<castmode>2ph_<mode><mask_name><round_name>" [(set (match_operand:<ssePHmode> 0 "register_operand" "=v") (float_truncate:<ssePHmode> @@ -6144,11 +6468,21 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>" - [(set (match_operand:V8HF 0 "register_operand" "=v") +(define_expand "trunc<mode>v4hf2" + [(set (match_operand:V4HF 0 "register_operand") + (float_truncate:V4HF (match_operand:VF4_128_8_256 1 "vector_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + operands[0] = lowpart_subreg (V8HFmode, operands[0], V4HFmode); + emit_insn (gen_avx512fp16_trunc<mode>v4hf2 (operands[0], operands[1])); + DONE; +}) + +(define_expand "avx512fp16_trunc<mode>v4hf2" + [(set (match_operand:V8HF 0 "register_operand") (vec_concat:V8HF (float_truncate:V4HF - (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (match_operand:VF4_128_8_256 1 "vector_operand")) (match_dup 2)))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V4HFmode);") @@ -6213,11 +6547,20 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_expand "avx512fp16_vcvtpd2ph_v2df" - [(set (match_operand:V8HF 0 "register_operand" "=v") +(define_expand "truncv2dfv2hf2" + [(set (match_operand:V2HF 0 "register_operand") + (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" +{ + operands[0] = lowpart_subreg (V8HFmode, operands[0], V2HFmode); + emit_insn (gen_avx512fp16_truncv2dfv2hf2 (operands[0], operands[1])); + DONE; +}) + +(define_expand "avx512fp16_truncv2dfv2hf2" + [(set (match_operand:V8HF 0 "register_operand") (vec_concat:V8HF - (float_truncate:V2HF - (match_operand:V2DF 1 "vector_operand" "vm")) + (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand")) (match_dup 2)))] "TARGET_AVX512FP16 && TARGET_AVX512VL" "operands[2] = CONST0_RTX (V6HFmode);") @@ -15229,6 +15572,21 @@ DONE; }) +(define_expand "vcondu<mode><sseintvecmodelower>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand") + (if_then_else:VF_AVX512FP16VL + (match_operator 3 "" + [(match_operand:<sseintvecmode> 4 "vector_operand") + (match_operand:<sseintvecmode> 5 "vector_operand")]) + (match_operand:VF_AVX512FP16VL 1 "general_operand") + (match_operand:VF_AVX512FP16VL 2 "general_operand")))] + "TARGET_AVX512FP16" +{ + bool ok = ix86_expand_int_vcond (operands); + gcc_assert (ok); + DONE; +}) + (define_expand "vcondeq<VI8F_128:mode>v2di" [(set (match_operand:VI8F_128 0 "register_operand") (if_then_else:VI8F_128 @@ -21298,14 +21656,14 @@ (set_attr "mode" "<MODE>")]) (define_insn "*sse4_1_round<ssescalarmodesuffix>" - [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=Yr,*x,x,v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (unspec:<ssescalarmode> [(match_operand:<ssescalarmode> 2 "nonimmediate_operand" "Yrm,*xm,xm,vm") (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")] UNSPEC_ROUND)) - (match_operand:VF_128 1 "register_operand" "0,0,x,v") + (match_operand:VFH_128 1 "register_operand" "0,0,x,v") (const_int 1)))] "TARGET_SSE4_1" "@ diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 157d49f..11e62c6 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -28,6 +28,9 @@ V16SF V8SF V4SF V8DF V4DF V2DF]) +(define_mode_iterator SUBST_CV + [V32HF V16HF V8HF]) + (define_mode_iterator SUBST_S [QI HI SI DI]) @@ -42,9 +45,11 @@ QI HI SI DI SF DF]) 
(define_subst_attr "mask_name" "mask" "" "_mask") +(define_subst_attr "maskc_name" "maskc" "" "_mask") (define_subst_attr "mask_applied" "mask" "false" "true") (define_subst_attr "mask_operand2" "mask" "" "%{%3%}%N2") (define_subst_attr "mask_operand3" "mask" "" "%{%4%}%N3") +(define_subst_attr "maskc_operand3" "maskc" "" "%{%4%}%N3") (define_subst_attr "mask_operand3_1" "mask" "" "%%{%%4%%}%%N3") ;; for sprintf (define_subst_attr "mask_operand4" "mask" "" "%{%5%}%N4") (define_subst_attr "mask_operand6" "mask" "" "%{%7%}%N6") @@ -89,6 +94,18 @@ (match_dup 0) (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))]) +(define_subst "maskc" + [(set (match_operand:SUBST_CV 0) + (match_operand:SUBST_CV 1))] + "TARGET_AVX512F" + [(set (match_dup 0) + (vec_merge:SUBST_CV + (match_dup 1) + (match_operand:SUBST_CV 2 "nonimm_or_0_operand" "0C") + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 3 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)))]) + (define_subst_attr "mask_scalar_merge_name" "mask_scalar_merge" "" "_mask") (define_subst_attr "mask_scalar_merge_operand3" "mask_scalar_merge" "" "%{%3%}") (define_subst_attr "mask_scalar_merge_operand4" "mask_scalar_merge" "" "%{%4%}") @@ -137,12 +154,31 @@ (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")) (match_dup 2) (const_int 1)))]) +(define_subst_attr "sdc_maskz_name" "sdc" "" "_maskz_1") +(define_subst_attr "sdc_mask_op4" "sdc" "" "%{%5%}%N4") +(define_subst_attr "sdc_mask_op5" "sdc" "" "%{%6%}%N5") +(define_subst_attr "sdc_mask_mode512bit_condition" "sdc" "1" "(<MODE_SIZE> == 64 || TARGET_AVX512VL)") + +(define_subst "sdc" + [(set (match_operand:SUBST_CV 0) + (match_operand:SUBST_CV 1))] + "" + [(set (match_dup 0) + (vec_merge:SUBST_CV + (match_dup 1) + (match_operand:SUBST_CV 2 "const0_operand" "C") + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 3 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK))) +]) (define_subst_attr "round_name" "round" "" "_round") (define_subst_attr "round_mask_operand2" "mask" "%R2" "%R4") (define_subst_attr "round_mask_operand3" "mask" "%R3" "%R5") +(define_subst_attr "round_maskc_operand3" "maskc" "%R3" "%R5") (define_subst_attr "round_mask_operand4" "mask" "%R4" "%R6") (define_subst_attr "round_sd_mask_operand4" "sd" "%R4" "%R6") +(define_subst_attr "round_sdc_mask_operand4" "sdc" "%R4" "%R6") (define_subst_attr "round_op2" "round" "" "%R2") (define_subst_attr "round_op3" "round" "" "%R3") (define_subst_attr "round_op4" "round" "" "%R4") @@ -150,8 +186,10 @@ (define_subst_attr "round_op6" "round" "" "%R6") (define_subst_attr "round_mask_op2" "round" "" "<round_mask_operand2>") (define_subst_attr "round_mask_op3" "round" "" "<round_mask_operand3>") +(define_subst_attr "round_maskc_op3" "round" "" "<round_maskc_operand3>") (define_subst_attr "round_mask_op4" "round" "" "<round_mask_operand4>") (define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>") +(define_subst_attr "round_sdc_mask_op4" "round" "" "<round_sdc_mask_operand4>") (define_subst_attr "round_constraint" "round" "vm" "v") (define_subst_attr "round_qq2phsuff" "round" "<qq2phsuff>" "") (define_subst_attr "bcst_round_constraint" "round" "vmBr" "v") @@ -189,6 +227,7 @@ (define_subst_attr "round_saeonly_mask_scalar_merge_operand4" "mask_scalar_merge" "%r4" "%r5") (define_subst_attr "round_saeonly_maskz_scalar_operand5" "maskz_scalar" "%r5" "%r7") (define_subst_attr "round_saeonly_sd_mask_operand5" "sd" "%r5" "%r7") +(define_subst_attr "round_saeonly_sdc_mask_operand5" "sdc" "%r5" "%r7") 
(define_subst_attr "round_saeonly_op2" "round_saeonly" "" "%r2") (define_subst_attr "round_saeonly_op3" "round_saeonly" "" "%r3") (define_subst_attr "round_saeonly_op4" "round_saeonly" "" "%r4") @@ -289,8 +328,12 @@ (match_operand:<avx512fmaskmode> 5 "register_operand")]) (define_subst_attr "mask_scalar_name" "mask_scalar" "" "_mask") +(define_subst_attr "mask_scalarcz_name" "mask_scalarcz" "" "_maskz") +(define_subst_attr "mask_scalarc_name" "mask_scalarc" "" "_mask") +(define_subst_attr "mask_scalarc_operand3" "mask_scalarc" "" "%{%4%}%N3") (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3") (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4") +(define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4") (define_subst "mask_scalar" [(set (match_operand:SUBST_V 0) @@ -308,12 +351,55 @@ (match_dup 2) (const_int 1)))]) +(define_subst "mask_scalarcz" + [(set (match_operand:SUBST_CV 0) + (vec_merge:SUBST_CV + (match_operand:SUBST_CV 1) + (match_operand:SUBST_CV 2) + (const_int 3)))] + "TARGET_AVX512F" + [(set (match_dup 0) + (vec_merge:SUBST_CV + (vec_merge:SUBST_CV + (match_dup 1) + (match_operand:SUBST_CV 3 "const0_operand" "C") + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)) + (match_dup 2) + (const_int 3)))]) + +(define_subst "mask_scalarc" + [(set (match_operand:SUBST_CV 0) + (vec_merge:SUBST_CV + (match_operand:SUBST_CV 1) + (match_operand:SUBST_CV 2) + (const_int 3)))] + "TARGET_AVX512F" + [(set (match_dup 0) + (vec_merge:SUBST_CV + (vec_merge:SUBST_CV + (match_dup 1) + (match_operand:SUBST_CV 3 "nonimm_or_0_operand" "0C") + (unspec:<avx512fmaskmode> + [(match_operand:<avx512fmaskcmode> 4 "register_operand" "Yk")] + UNSPEC_COMPLEX_MASK)) + (match_dup 2) + (const_int 3)))]) + (define_subst_attr "round_scalar_name" "round_scalar" "" "_round") +(define_subst_attr "round_scalarcz_name" "round_scalarcz" "" "_round") (define_subst_attr "round_scalar_mask_operand3" "mask_scalar" "%R3" "%R5") +(define_subst_attr "round_scalarc_mask_operand3" "mask_scalarc" "%R3" "%R5") +(define_subst_attr "round_scalarcz_mask_operand4" "mask_scalarcz" "%R4" "%R6") (define_subst_attr "round_scalar_mask_op3" "round_scalar" "" "<round_scalar_mask_operand3>") +(define_subst_attr "round_scalarc_mask_op3" "round_scalarcz" "" "<round_scalarc_mask_operand3>") +(define_subst_attr "round_scalarcz_mask_op4" "round_scalarcz" "" "<round_scalarcz_mask_operand4>") (define_subst_attr "round_scalar_constraint" "round_scalar" "vm" "v") +(define_subst_attr "round_scalarcz_constraint" "round_scalarcz" "vm" "v") (define_subst_attr "round_scalar_prefix" "round_scalar" "vex" "evex") (define_subst_attr "round_scalar_nimm_predicate" "round_scalar" "nonimmediate_operand" "register_operand") +(define_subst_attr "round_scalarcz_nimm_predicate" "round_scalarcz" "vector_operand" "register_operand") (define_subst "round_scalar" [(set (match_operand:SUBST_V 0) @@ -331,6 +417,22 @@ (match_operand:SI 3 "const_4_or_8_to_11_operand")] UNSPEC_EMBEDDED_ROUNDING))]) +(define_subst "round_scalarcz" + [(set (match_operand:SUBST_V 0) + (vec_merge:SUBST_V + (match_operand:SUBST_V 1) + (match_operand:SUBST_V 2) + (const_int 3)))] + "TARGET_AVX512F" + [(set (match_dup 0) + (unspec:SUBST_V [ + (vec_merge:SUBST_V + (match_dup 1) + (match_dup 2) + (const_int 3)) + (match_operand:SI 3 "const_4_or_8_to_11_operand")] + UNSPEC_EMBEDDED_ROUNDING))]) + (define_subst_attr "round_saeonly_scalar_name" "round_saeonly_scalar" "" "_round") 
(define_subst_attr "round_saeonly_scalar_mask_operand3" "mask_scalar" "%r3" "%r5") (define_subst_attr "round_saeonly_scalar_mask_operand4" "mask_scalar" "%r4" "%r6") diff --git a/gcc/config/lm32/uclinux-elf.h b/gcc/config/lm32/uclinux-elf.h index 370df4c5..5b638fa 100644 --- a/gcc/config/lm32/uclinux-elf.h +++ b/gcc/config/lm32/uclinux-elf.h @@ -67,6 +67,7 @@ #define TARGET_OS_CPP_BUILTINS() GNU_USER_TARGET_OS_CPP_BUILTINS() +#undef LINK_GCC_C_SEQUENCE_SPEC #define LINK_GCC_C_SEQUENCE_SPEC \ "%{static|static-pie:--start-group} %G %{!nolibc:%L} \ %{static|static-pie:--end-group}%{!static:%{!static-pie:%G}}" diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c index 0614302..69ba5bd 100644 --- a/gcc/config/pa/pa.c +++ b/gcc/config/pa/pa.c @@ -541,6 +541,16 @@ pa_option_override (void) write_symbols = NO_DEBUG; } + if (TARGET_64BIT && TARGET_HPUX) + { + /* DWARF5 is not supported by gdb. Don't emit DWARF5 unless + specifically selected. */ + if (!global_options_set.x_dwarf_strict) + dwarf_strict = 1; + if (!global_options_set.x_dwarf_version) + dwarf_version = 4; + } + /* We only support the "big PIC" model now. And we always generate PIC code when in 64bit mode. */ if (flag_pic == 1 || TARGET_64BIT) diff --git a/gcc/config/pru/constraints.md b/gcc/config/pru/constraints.md index a31ae93..1e0e703 100644 --- a/gcc/config/pru/constraints.md +++ b/gcc/config/pru/constraints.md @@ -34,6 +34,7 @@ ;; The following constraints are intended for internal use only: ;; Rmd0, Rms0, Rms1: Registers for MUL instruction operands. ;; Rsib: Jump address register suitable for sibling calls. +;; Rrio: The R30 and R31 I/O registers. ;; M: -255 to 0 (for converting ADD to SUB with suitable UBYTE OP2). ;; N: -32768 to 32767 (16-bit signed integer). ;; O: -128 to 127 (8-bit signed integer). @@ -57,6 +58,10 @@ "@internal The multiply source 1 register.") +(define_register_constraint "Rrio" "REGIO_REGS" + "@internal + The R30 and R31 I/O registers.") + ;; Integer constraints. 
(define_constraint "I" diff --git a/gcc/config/pru/predicates.md b/gcc/config/pru/predicates.md index 469002f..1a4b98e 100644 --- a/gcc/config/pru/predicates.md +++ b/gcc/config/pru/predicates.md @@ -121,6 +121,25 @@ return 0; }) +(define_predicate "regio_operand" + (match_code "subreg,reg") +{ + if (register_operand (op, mode)) + { + int regno; + + if (REG_P (op)) + regno = REGNO (op); + else if (GET_CODE (op) == SUBREG && REG_P (SUBREG_REG (op))) + regno = REGNO (SUBREG_REG (op)); + else + return 0; + + return REGNO_REG_CLASS (regno) == REGIO_REGS; + } + return 0; +}) + (define_predicate "reg_or_const_int_operand" (ior (match_operand 0 "const_int_operand") (match_operand 0 "register_operand"))) diff --git a/gcc/config/pru/pru-pragma.c b/gcc/config/pru/pru-pragma.c index 01d0761..3beec23 100644 --- a/gcc/config/pru/pru-pragma.c +++ b/gcc/config/pru/pru-pragma.c @@ -83,4 +83,6 @@ pru_register_pragmas (void) { c_register_pragma (NULL, "ctable_entry", pru_pragma_ctable_entry); c_register_pragma (NULL, "CTABLE_ENTRY", pru_pragma_ctable_entry); + + c_register_addr_space ("__regio_symbol", ADDR_SPACE_REGIO); } diff --git a/gcc/config/pru/pru-protos.h b/gcc/config/pru/pru-protos.h index 74129e9..031ea9e 100644 --- a/gcc/config/pru/pru-protos.h +++ b/gcc/config/pru/pru-protos.h @@ -62,7 +62,10 @@ extern int pru_get_ctable_exact_base_index (unsigned HOST_WIDE_INT caddr); extern int pru_get_ctable_base_index (unsigned HOST_WIDE_INT caddr); extern int pru_get_ctable_base_offset (unsigned HOST_WIDE_INT caddr); +extern int pru_symref2ioregno (rtx op); + extern void pru_register_abicheck_pass (void); + #endif /* RTX_CODE */ #ifdef TREE_CODE diff --git a/gcc/config/pru/pru.c b/gcc/config/pru/pru.c index 30d0da1..9f264b4 100644 --- a/gcc/config/pru/pru.c +++ b/gcc/config/pru/pru.c @@ -1403,11 +1403,42 @@ pru_valid_addr_expr_p (machine_mode mode, rtx base, rtx offset, bool strict_p) return false; } -/* Implement TARGET_LEGITIMATE_ADDRESS_P. */ +/* Return register number (either for r30 or r31) which maps to the + corresponding symbol OP's name in the __regio_symbol address namespace. + + If no mapping can be established (i.e. symbol name is invalid), then + return -1. */ +int pru_symref2ioregno (rtx op) +{ + if (!SYMBOL_REF_P (op)) + return -1; + + const char *name = XSTR (op, 0); + if (!strcmp (name, "__R30")) + return R30_REGNUM; + else if (!strcmp (name, "__R31")) + return R31_REGNUM; + else + return -1; +} + +/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P. */ static bool -pru_legitimate_address_p (machine_mode mode, - rtx operand, bool strict_p) +pru_addr_space_legitimate_address_p (machine_mode mode, rtx operand, + bool strict_p, addr_space_t as) { + if (as == ADDR_SPACE_REGIO) + { + /* Address space constraints for __regio_symbol have been checked in + TARGET_INSERT_ATTRIBUTES, and some more checks will be done + during RTL expansion of "mov<mode>". */ + return true; + } + else if (as != ADDR_SPACE_GENERIC) + { + gcc_unreachable (); + } + switch (GET_CODE (operand)) { /* Direct. */ @@ -2002,6 +2033,117 @@ pru_file_start (void) need to confuse users with this warning. */ fprintf (asm_out_file, "\t.set no_warn_regname_label\n"); } + +/* Scan type TYP for pointer references to address space other than + ADDR_SPACE_GENERIC. Return true if such reference is found. + Much of this code was taken from the avr port. 
*/ + +static bool +pru_nongeneric_pointer_addrspace (tree typ) +{ + while (ARRAY_TYPE == TREE_CODE (typ)) + typ = TREE_TYPE (typ); + + if (POINTER_TYPE_P (typ)) + { + addr_space_t as; + tree target = TREE_TYPE (typ); + + /* Pointer to function: Test the function's return type. */ + if (FUNCTION_TYPE == TREE_CODE (target)) + return pru_nongeneric_pointer_addrspace (TREE_TYPE (target)); + + /* "Ordinary" pointers... */ + + while (TREE_CODE (target) == ARRAY_TYPE) + target = TREE_TYPE (target); + + as = TYPE_ADDR_SPACE (target); + + if (!ADDR_SPACE_GENERIC_P (as)) + return true; + + /* Scan pointer's target type. */ + return pru_nongeneric_pointer_addrspace (target); + } + + return false; +} + +/* Implement `TARGET_INSERT_ATTRIBUTES'. For PRU it's used as a hook to + provide better diagnostics for some invalid usages of the __regio_symbol + address space. + + Any escapes of the following checks are supposed to be caught + during the "mov<mode>" pattern expansion. */ + +static void +pru_insert_attributes (tree node, tree *attributes ATTRIBUTE_UNUSED) +{ + + /* Validate __regio_symbol variable declarations. */ + if (VAR_P (node)) + { + const char *name = DECL_NAME (node) + ? IDENTIFIER_POINTER (DECL_NAME (node)) + : "<unknown>"; + tree typ = TREE_TYPE (node); + addr_space_t as = TYPE_ADDR_SPACE (typ); + + if (as == ADDR_SPACE_GENERIC) + return; + + if (AGGREGATE_TYPE_P (typ)) + { + error ("aggregate types are prohibited in " + "%<__regio_symbol%> address space"); + /* Don't bother anymore. Below checks would pile + meaningless errors, which would confuse user. */ + return; + } + if (DECL_INITIAL (node) != NULL_TREE) + error ("variables in %<__regio_symbol%> address space " + "cannot have initial value"); + if (DECL_REGISTER (node)) + error ("variables in %<__regio_symbol%> address space " + "cannot be declared %<register%>"); + if (!TYPE_VOLATILE (typ)) + error ("variables in %<__regio_symbol%> address space " + "must be declared %<volatile%>"); + if (!DECL_EXTERNAL (node)) + error ("variables in %<__regio_symbol%> address space " + "must be declared %<extern%>"); + if (TYPE_MODE (typ) != SImode) + error ("only 32-bit access is supported " + "for %<__regio_symbol%> address space"); + if (strcmp (name, "__R30") != 0 && strcmp (name, "__R31") != 0) + error ("register name %<%s%> not recognized " + "in %<__regio_symbol%> address space", name); + } + + tree typ = NULL_TREE; + + switch (TREE_CODE (node)) + { + case FUNCTION_DECL: + typ = TREE_TYPE (TREE_TYPE (node)); + break; + case TYPE_DECL: + case RESULT_DECL: + case VAR_DECL: + case FIELD_DECL: + case PARM_DECL: + typ = TREE_TYPE (node); + break; + case POINTER_TYPE: + typ = node; + break; + default: + break; + } + if (typ != NULL_TREE && pru_nongeneric_pointer_addrspace (typ)) + error ("pointers to %<__regio_symbol%> address space are prohibited"); +} /* Function argument related. 
*/ @@ -2933,6 +3075,9 @@ pru_unwind_word_mode (void) #undef TARGET_ASM_FILE_START #define TARGET_ASM_FILE_START pru_file_start +#undef TARGET_INSERT_ATTRIBUTES +#define TARGET_INSERT_ATTRIBUTES pru_insert_attributes + #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS pru_init_builtins #undef TARGET_EXPAND_BUILTIN @@ -2979,8 +3124,9 @@ pru_unwind_word_mode (void) #undef TARGET_MUST_PASS_IN_STACK #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size -#undef TARGET_LEGITIMATE_ADDRESS_P -#define TARGET_LEGITIMATE_ADDRESS_P pru_legitimate_address_p +#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P +#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \ + pru_addr_space_legitimate_address_p #undef TARGET_INIT_LIBFUNCS #define TARGET_INIT_LIBFUNCS pru_init_libfuncs diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h index 9b6be32..03f08b1 100644 --- a/gcc/config/pru/pru.h +++ b/gcc/config/pru/pru.h @@ -215,6 +215,7 @@ enum reg_class MULDST_REGS, MULSRC0_REGS, MULSRC1_REGS, + REGIO_REGS, GP_REGS, ALL_REGS, LIM_REG_CLASSES @@ -229,6 +230,7 @@ enum reg_class "MULDST_REGS", \ "MULSRC0_REGS", \ "MULSRC1_REGS", \ + "REGIO_REGS", \ "GP_REGS", \ "ALL_REGS" } @@ -242,6 +244,7 @@ enum reg_class /* MULDST_REGS */ { 0, 0, 0, 0x00000f00, 0}, \ /* MULSRC0_REGS */ { 0, 0, 0, 0x000f0000, 0}, \ /* MULSRC1_REGS */ { 0, 0, 0, 0x00f00000, 0}, \ + /* REGIO_REGS */ { 0, 0, 0, 0xff000000, 0}, \ /* GP_REGS */ { ~0, ~0, ~0, ~0, 0}, \ /* ALL_REGS */ { ~0,~0, ~0, ~0, ~0} \ } @@ -252,6 +255,8 @@ enum reg_class ((REGNO) == MULDST_REGNUM ? MULDST_REGS \ : (REGNO) == MULSRC0_REGNUM ? MULSRC0_REGS \ : (REGNO) == MULSRC1_REGNUM ? MULSRC1_REGS \ + : (REGNO) == R30_REGNUM ? REGIO_REGS \ + : (REGNO) == R31_REGNUM ? REGIO_REGS \ : (REGNO) >= FIRST_ARG_REGNUM \ && (REGNO) <= LAST_ARG_REGNUM ? SIB_REGS \ : (REGNO) == STATIC_CHAIN_REGNUM ? SIB_REGS \ diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md index e6cfa8e..c0ded8e 100644 --- a/gcc/config/pru/pru.md +++ b/gcc/config/pru/pru.md @@ -36,6 +36,8 @@ (MULSRC0_REGNUM 112) ; Multiply source register. (MULSRC1_REGNUM 116) ; Multiply source register. (LAST_NONIO_GP_REGNUM 119) ; Last non-I/O general purpose register. + (R30_REGNUM 120) ; R30 I/O register. + (R31_REGNUM 124) ; R31 I/O register. (LOOPCNTR_REGNUM 128) ; internal LOOP counter register (LAST_GP_REGNUM 132) ; Last general purpose register. @@ -49,6 +51,13 @@ ] ) +;; Enumerate address spaces. +(define_constants + [ + (ADDR_SPACE_REGIO 1) ; Access to R30 and R31 I/O registers. + ] +) + ;; Enumeration of UNSPECs. (define_c_enum "unspec" [ @@ -68,6 +77,9 @@ UNSPECV_HALT UNSPECV_BLOCKAGE + + UNSPECV_REGIO_READ + UNSPECV_REGIO_WRITE ]) ; Length of an instruction (in bytes). @@ -129,11 +141,62 @@ (match_operand:MOV8_16_32 1 "general_operand"))] "" { - /* It helps to split constant loading and memory access - early, so that the LDI/LDI32 instructions can be hoisted - outside a loop body. */ - if (MEM_P (operands[0])) - operands[1] = force_reg (<MODE>mode, operands[1]); + if (MEM_P (operands[0]) + && MEM_ADDR_SPACE (operands[0]) == ADDR_SPACE_REGIO) + + { + /* Intercept writes to the SImode register I/O "address space". 
*/ + gcc_assert (<MODE>mode == SImode); + + if (!SYMBOL_REF_P (XEXP (operands[0], 0))) + { + error ("invalid access to %<__regio_symbol%> address space"); + FAIL; + } + + if (!REG_P (operands[1])) + operands[1] = force_reg (<MODE>mode, operands[1]); + + int regiono = pru_symref2ioregno (XEXP (operands[0], 0)); + gcc_assert (regiono >= 0); + rtx regio = gen_rtx_REG (<MODE>mode, regiono); + rtx unspecv = gen_rtx_UNSPEC_VOLATILE (<MODE>mode, + gen_rtvec (1, operands[1]), + UNSPECV_REGIO_WRITE); + emit_insn (gen_rtx_SET (regio, unspecv)); + DONE; + } + else if (MEM_P (operands[1]) + && MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_REGIO) + { + /* Intercept reads from the SImode register I/O "address space". */ + gcc_assert (<MODE>mode == SImode); + + if (!SYMBOL_REF_P (XEXP (operands[1], 0))) + { + error ("invalid access to %<__regio_symbol%> address space"); + FAIL; + } + + if (MEM_P (operands[0])) + operands[0] = force_reg (<MODE>mode, operands[0]); + + int regiono = pru_symref2ioregno (XEXP (operands[1], 0)); + gcc_assert (regiono >= 0); + rtx regio = gen_rtx_REG (<MODE>mode, regiono); + rtx unspecv = gen_rtx_UNSPEC_VOLATILE (<MODE>mode, + gen_rtvec (1, regio), + UNSPECV_REGIO_READ); + emit_insn (gen_rtx_SET (operands[0], unspecv)); + DONE; + } + else if (MEM_P (operands[0])) + { + /* It helps to split constant loading and memory access + early, so that the LDI/LDI32 instructions can be hoisted + outside a loop body. */ + operands[1] = force_reg (<MODE>mode, operands[1]); + } }) ;; Keep a single pattern for 32 bit MOV operations. LRA requires that the @@ -546,6 +609,35 @@ (include "alu-zext.md") +;; Patterns for accessing the R30/R31 I/O registers. + +(define_insn "*regio_readsi" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec_volatile:SI + [(match_operand:SI 1 "regio_operand" "Rrio")] + UNSPECV_REGIO_READ))] + "" + "mov\\t%0, %1" + [(set_attr "type" "alu")]) + +(define_insn "*regio_nozext_writesi" + [(set (match_operand:SI 0 "regio_operand" "=Rrio") + (unspec_volatile:SI + [(match_operand:SI 1 "register_operand" "r")] + UNSPECV_REGIO_WRITE))] + "" + "mov\\t%0, %1" + [(set_attr "type" "alu")]) + +(define_insn "*regio_zext_write_r30<EQS0:mode>" + [(set (match_operand:SI 0 "regio_operand" "=Rrio") + (unspec_volatile:SI + [(zero_extend:SI (match_operand:EQS0 1 "register_operand" "r"))] + UNSPECV_REGIO_WRITE))] + "" + "mov\\t%0, %1" + [(set_attr "type" "alu")]) + ;; DI logical ops could be automatically split into WORD-mode ops in ;; expand_binop(). But then we'll miss an opportunity to use SI mode ;; operations, since WORD mode for PRU is QI. 
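
To illustrate the PRU changes above: the new __regio_symbol address space is intended to be used roughly as in the sketch below. The __R30/__R31 names are the ones pru_insert_attributes and pru_symref2ioregno accept; the pin number and the helper function are illustrative assumptions, not part of this commit.

/* Minimal sketch: mirror GPI pin 0 onto GPO pin 0 on the PRU.
   Reads of __R31 and writes to __R30 are intercepted by the "mov<mode>"
   expander above and become UNSPECV_REGIO_READ / UNSPECV_REGIO_WRITE.  */
extern volatile __regio_symbol unsigned int __R30;	/* GPO register.  */
extern volatile __regio_symbol unsigned int __R31;	/* GPI register.  */

void
mirror_input_pin0 (void)
{
  if (__R31 & (1u << 0))
    __R30 |= (1u << 0);	/* read-modify-write of R30 */
  else
    __R30 &= ~(1u << 0);
}

Per the checks added in pru_insert_attributes, such declarations must be extern, volatile, 32-bit wide, uninitialized, and named __R30 or __R31; anything else is diagnosed.
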
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index f88877f..98364f0 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -802,7 +802,7 @@ rtx hp = gen_reg_rtx (<MODE>mode); rtx lp = gen_reg_rtx (<MODE>mode); - emit_insn (gen_mul<mode>3_highpart (hp, operands[1], operands[2])); + emit_insn (gen_smul<mode>3_highpart (hp, operands[1], operands[2])); emit_insn (gen_mul<mode>3 (operands[0], operands[1], operands[2])); emit_insn (gen_ashr<mode>3 (lp, operands[0], GEN_INT (BITS_PER_WORD - 1))); @@ -899,14 +899,14 @@ emit_insn (gen_muldi3 (low, operands[1], operands[2])); rtx high = gen_reg_rtx (DImode); - emit_insn (gen_<u>muldi3_highpart (high, operands[1], operands[2])); + emit_insn (gen_<su>muldi3_highpart (high, operands[1], operands[2])); emit_move_insn (gen_lowpart (DImode, operands[0]), low); emit_move_insn (gen_highpart (DImode, operands[0]), high); DONE; }) -(define_insn "<u>muldi3_highpart" +(define_insn "<su>muldi3_highpart" [(set (match_operand:DI 0 "register_operand" "=r") (truncate:DI (lshiftrt:TI @@ -961,13 +961,13 @@ { rtx temp = gen_reg_rtx (SImode); emit_insn (gen_mulsi3 (temp, operands[1], operands[2])); - emit_insn (gen_<u>mulsi3_highpart (riscv_subword (operands[0], true), + emit_insn (gen_<su>mulsi3_highpart (riscv_subword (operands[0], true), operands[1], operands[2])); emit_insn (gen_movsi (riscv_subword (operands[0], false), temp)); DONE; }) -(define_insn "<u>mulsi3_highpart" +(define_insn "<su>mulsi3_highpart" [(set (match_operand:SI 0 "register_operand" "=r") (truncate:SI (lshiftrt:DI diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h index 6abf8e8..120b01f 100644 --- a/gcc/config/rs6000/darwin.h +++ b/gcc/config/rs6000/darwin.h @@ -203,7 +203,7 @@ /* Make both r2 and r13 available for allocation. */ #define FIXED_R2 0 -#define FIXED_R13 0 +#define FIXED_R13 TARGET_64BIT /* Base register for access to local variables of the function. */ @@ -213,6 +213,9 @@ #undef RS6000_PIC_OFFSET_TABLE_REGNUM #define RS6000_PIC_OFFSET_TABLE_REGNUM 31 +#undef FIRST_SAVED_GP_REGNO +#define FIRST_SAVED_GP_REGNO 13 + /* Darwin's stack must remain 16-byte aligned for both 32 and 64 bit ABIs. */ diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index 7d48548..2eceb2c7 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -6223,11 +6223,19 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { or vector type. If a non-floating point or vector type is found, or if a floating point or vector type that doesn't match a non-VOIDmode *MODEP is found, then return -1, otherwise return the count in the - sub-tree. */ + sub-tree. + + There have been some ABI snafus along the way with C++. Modify + EMPTY_BASE_SEEN to a nonzero value iff a C++ empty base class makes + an appearance; separate flag bits indicate whether or not such a + field is marked "no unique address". Modify ZERO_WIDTH_BF_SEEN + to 1 iff a C++ zero-length bitfield makes an appearance, but + in this case otherwise treat this as still being a homogeneous + aggregate. 
*/ static int rs6000_aggregate_candidate (const_tree type, machine_mode *modep, - int *empty_base_seen) + int *empty_base_seen, int *zero_width_bf_seen) { machine_mode mode; HOST_WIDE_INT size; @@ -6298,7 +6306,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep, return -1; count = rs6000_aggregate_candidate (TREE_TYPE (type), modep, - empty_base_seen); + empty_base_seen, + zero_width_bf_seen); if (count == -1 || !index || !TYPE_MAX_VALUE (index) @@ -6336,6 +6345,26 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep, if (TREE_CODE (field) != FIELD_DECL) continue; + if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field)) + { + /* GCC 11 and earlier generated incorrect code in a rare + corner case for C++. When a RECORD_TYPE looks like a + homogeneous aggregate, except that it also contains + one or more zero-width bit fields, these earlier + compilers would incorrectly pass the fields in FPRs + or VSRs. This occurred because the front end wrongly + removed these bitfields from the RECORD_TYPE. In + GCC 12 and later, the front end flaw was corrected. + We want to diagnose this case. To do this, we pretend + that we don't see the zero-width bit fields (hence + the continue statement here), but pass back a flag + indicating what happened. The caller then diagnoses + the issue and rejects the RECORD_TYPE as a homogeneous + aggregate. */ + *zero_width_bf_seen = 1; + continue; + } + if (DECL_FIELD_ABI_IGNORED (field)) { if (lookup_attribute ("no_unique_address", @@ -6347,7 +6376,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep, } sub_count = rs6000_aggregate_candidate (TREE_TYPE (field), modep, - empty_base_seen); + empty_base_seen, + zero_width_bf_seen); if (sub_count < 0) return -1; count += sub_count; @@ -6381,7 +6411,8 @@ rs6000_aggregate_candidate (const_tree type, machine_mode *modep, continue; sub_count = rs6000_aggregate_candidate (TREE_TYPE (field), modep, - empty_base_seen); + empty_base_seen, + zero_width_bf_seen); if (sub_count < 0) return -1; count = count > sub_count ? 
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 060f51a..ad86072 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5289,9 +5289,6 @@ struct rs6000_cost_data
 static void
 rs6000_density_test (rs6000_cost_data *data)
 {
-  const int DENSITY_PCT_THRESHOLD = 85;
-  const int DENSITY_SIZE_THRESHOLD = 70;
-  const int DENSITY_PENALTY = 10;
   struct loop *loop = data->loop_info;
   basic_block *bbs = get_loop_body (loop);
   int nbbs = loop->num_nodes;
@@ -5327,26 +5324,21 @@ rs6000_density_test (rs6000_cost_data *data)
   free (bbs);
   density_pct = (vec_cost * 100) / (vec_cost + not_vec_cost);
 
-  if (density_pct > DENSITY_PCT_THRESHOLD
-      && vec_cost + not_vec_cost > DENSITY_SIZE_THRESHOLD)
+  if (density_pct > rs6000_density_pct_threshold
+      && vec_cost + not_vec_cost > rs6000_density_size_threshold)
     {
-      data->cost[vect_body] = vec_cost * (100 + DENSITY_PENALTY) / 100;
+      data->cost[vect_body] = vec_cost * (100 + rs6000_density_penalty) / 100;
       if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "density %d%%, cost %d exceeds threshold, penalizing "
-			 "loop body cost by %d%%\n", density_pct,
-			 vec_cost + not_vec_cost, DENSITY_PENALTY);
+			 "loop body cost by %u%%\n", density_pct,
+			 vec_cost + not_vec_cost, rs6000_density_penalty);
     }
 
   /* Check whether we need to penalize the body cost to account
      for excess strided or elementwise loads.  */
   if (data->extra_ctor_cost > 0)
     {
-      /* Threshold for load stmts percentage in all vectorized stmts.  */
-      const int DENSITY_LOAD_PCT_THRESHOLD = 45;
-      /* Threshold for total number of load stmts.  */
-      const int DENSITY_LOAD_NUM_THRESHOLD = 20;
-
       gcc_assert (data->nloads <= data->nstmts);
       unsigned int load_pct = (data->nloads * 100) / data->nstmts;
 
@@ -5360,8 +5352,8 @@ rs6000_density_test (rs6000_cost_data *data)
	 the loads.
	 One typical case is the innermost loop of the hotspot of
	 SPEC2017 503.bwaves_r without loop interchange.  */
-      if (data->nloads > DENSITY_LOAD_NUM_THRESHOLD
-	  && load_pct > DENSITY_LOAD_PCT_THRESHOLD)
+      if (data->nloads > (unsigned int) rs6000_density_load_num_threshold
+	  && load_pct > (unsigned int) rs6000_density_load_pct_threshold)
	{
	  data->cost[vect_body] += data->extra_ctor_cost;
	  if (dump_enabled_p ())
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index c1cb9ab..9d7878f 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -639,3 +639,41 @@ Enable instructions that guard against return-oriented programming attacks.
 mprivileged
 Target Var(rs6000_privileged) Init(0)
 Generate code that will run in privileged state.
+
+-param=rs6000-density-pct-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_pct_threshold) Init(85) IntegerRange(0, 100) Param
+When costing for loop vectorization, we probably need to penalize the loop body
+cost if the existing cost model may not adequately reflect delays from
+unavailable vector resources.  We collect the cost for vectorized statements
+and non-vectorized statements separately, check the proportion of vec_cost to
+total cost of vec_cost and non vec_cost, and penalize only if the proportion
+exceeds the threshold specified by this parameter.  The default value is 85.
+
+-param=rs6000-density-size-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_size_threshold) Init(70) IntegerRange(0, 1000) Param
+Like parameter rs6000-density-pct-threshold, we also check the total sum of
+vec_cost and non vec_cost, and penalize only if the sum exceeds the threshold
+specified by this parameter.  The default value is 70.
+
+-param=rs6000-density-penalty=
+Target Undocumented Joined UInteger Var(rs6000_density_penalty) Init(10) IntegerRange(0, 1000) Param
+When both heuristics with rs6000-density-pct-threshold and
+rs6000-density-size-threshold are satisfied, we decide to penalize the loop
+body cost by the value which is specified by this parameter.  The default
+value is 10.
+
+-param=rs6000-density-load-pct-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_load_pct_threshold) Init(45) IntegerRange(0, 100) Param
+When costing for loop vectorization, we probably need to penalize the loop body
+cost by accounting for excess strided or elementwise loads.  We collect the
+numbers for general statements and load statements according to the information
+for statements to be vectorized, check the proportion of load statements, and
+penalize only if the proportion exceeds the threshold specified by this
+parameter.  The default value is 45.
+
+-param=rs6000-density-load-num-threshold=
+Target Undocumented Joined UInteger Var(rs6000_density_load_num_threshold) Init(20) IntegerRange(0, 1000) Param
+Like parameter rs6000-density-load-pct-threshold, we also check if the total
+number of load statements exceeds the threshold specified by this parameter,
+and penalize only if it's satisfied.  The default value is 20.
+
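With the rs6000.opt entries above, the density heuristics in rs6000_density_test read their thresholds from undocumented target --param knobs instead of hard-coded constants, so they can be tuned without rebuilding the compiler. The sketch below is only an illustration: the loop is a made-up example of the kind of code the density test costs, and the command line in the comment is a hypothetical invocation showing the --param syntax; the defaults (85, 70, 10, 45 and 20) match the old constants.

/* Hypothetical tuning experiment, e.g.:
     g++ -O3 -mcpu=power9 -fdump-tree-vect-details \
         --param=rs6000-density-pct-threshold=90 \
         --param=rs6000-density-penalty=15 density.cc
   The vect dump then shows whether the "density ... exceeds threshold"
   note from rs6000_density_test fired for loops like this one.  */
void
saxpy (int n, float a, const float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];
}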
diff --git a/gcc/config/rs6000/vxworks.h b/gcc/config/rs6000/vxworks.h
index 5facbbb..d8ecc02 100644
--- a/gcc/config/rs6000/vxworks.h
+++ b/gcc/config/rs6000/vxworks.h
@@ -147,10 +147,6 @@ along with GCC; see the file COPYING3.  If not see
 #undef FUNCTION_PROFILER
 #define FUNCTION_PROFILER(FILE,LABELNO) VXWORKS_FUNCTION_PROFILER(FILE,LABELNO)
 
-/* Initialize library function table.  */
-#undef TARGET_INIT_LIBFUNCS
-#define TARGET_INIT_LIBFUNCS rs6000_vxworks_init_libfuncs
-
 /* Nor sdata, for kernel mode.  We use this in SUBSUBTARGET_INITIALIZE_OPTIONS,
    after rs6000_rtp has been initialized.  */
 #undef SDATA_DEFAULT_SIZE
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 54dd633..e043854 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -6414,6 +6414,15 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
   if (bitsize + bitpos > GET_MODE_BITSIZE (mode))
     return false;
 
+  /* Just a move.  */
+  if (bitpos == 0
+      && bitsize == GET_MODE_BITSIZE (GET_MODE (src))
+      && mode == GET_MODE (src))
+    {
+      emit_move_insn (dest, src);
+      return true;
+    }
+
   /* Generate INSERT IMMEDIATE (IILL et al).  */
   /* (set (ze (reg)) (const_int)).  */
   if (TARGET_ZARCH
@@ -6510,6 +6519,7 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
       && (bitpos & 32) == ((bitpos + bitsize - 1) & 32)
       && MEM_P (src)
      && (mode == DImode || mode == SImode)
+      && mode != smode
       && register_operand (dest, mode))
     {
       /* Emit a strict_low_part pattern if possible.  */
diff --git a/gcc/config/s390/tpf.md b/gcc/config/s390/tpf.md
index 297e9d1..35b3719 100644
--- a/gcc/config/s390/tpf.md
+++ b/gcc/config/s390/tpf.md
@@ -21,7 +21,8 @@
   [(unspec_volatile [(match_operand 0 "const_int_operand" "J")
		      (match_operand 1 "const_int_operand" "J")]
		     UNSPECV_TPF_PROLOGUE)
-   (clobber (reg:DI 1))]
+   (clobber (reg:DI 1))
+   (clobber (reg:CC CC_REGNUM))]
   "TARGET_TPF_PROFILING"
   "larl\t%%r1,.+14\;tm\t%0,255\;bnz\t%1"
   [(set_attr "length" "14")])
@@ -31,7 +32,8 @@
   [(unspec_volatile [(match_operand 0 "const_int_operand" "J")
		      (match_operand 1 "const_int_operand" "J")]
		     UNSPECV_TPF_EPILOGUE)
-   (clobber (reg:DI 1))]
+   (clobber (reg:DI 1))
+   (clobber (reg:CC CC_REGNUM))]
   "TARGET_TPF_PROFILING"
   "larl\t%%r1,.+14\;tm\t%0,255\;bnz\t%1"
   [(set_attr "length" "14")])