author | Ian Lance Taylor <iant@golang.org> | 2021-09-17 08:46:39 -0700
committer | Ian Lance Taylor <iant@golang.org> | 2021-09-17 08:46:39 -0700
commit | a0791d0ed4f147ef347e83f4aedc7ad03f1a2008 (patch)
tree | 7b3526910798e4cff7a7200d684383046bac6225 /gcc/config
parent | e252b51ccde010cbd2a146485d8045103cd99533 (diff)
parent | 89be17a1b231ade643f28fbe616d53377e069da8 (diff)
Merge from trunk revision 89be17a1b231ade643f28fbe616d53377e069da8.
Diffstat (limited to 'gcc/config')
36 files changed, 8988 insertions, 748 deletions
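Most of the gcc/config churn in this merge is the AVX512-FP16 work in avx512fp16intrin.h: new `_Float16` cast/undefined/zero-extend helpers plus the add/sub/mul/div, min/max, compare, sqrt, rsqrt, rcp, scalef and reduce intrinsic families, together with a rename of the underlying builtins from the `__builtin_ia32_v*_v32hf_mask*` / `__builtin_ia32_v*_v8hf_mask*` style to `__builtin_ia32_*ph512_mask*` / `__builtin_ia32_*sh_mask*`. As a rough illustration of how the merged intrinsics are used from user code (a minimal sketch, not part of the commit; it assumes a GCC with AVX512FP16 support and the `-mavx512fp16` option):

```c
/* Minimal sketch using AVX512-FP16 intrinsics touched by this merge.
   Assumes a toolchain with AVX512FP16 support, e.g.
     gcc -O2 -mavx512fp16 fp16_demo.c
   Not part of the commit itself.  */
#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m512h a = _mm512_set1_ph ((_Float16) 1.5f);   /* broadcast 1.5 into 32 lanes */
  __m512h b = _mm512_set1_ph ((_Float16) 0.25f);
  __mmask32 m = 0x0000ffff;                       /* update only the low 16 lanes */

  /* Masked add: lanes whose mask bit is clear keep the value from 'a'.  */
  __m512h c = _mm512_mask_add_ph (a, m, a, b);

  /* Scalar extraction helper added in this merge.  */
  _Float16 lane0 = _mm512_cvtsh_h (c);

  /* Cast helper added in this merge: reinterpret the half vector as floats.  */
  __m512 as_ps = _mm512_castph_ps (c);
  (void) as_ps;

  printf ("lane 0 = %f\n", (double) lane0);
  return 0;
}
```

Note that the builtin renames below (for example `__builtin_ia32_vaddph_v32hf_mask` becoming `__builtin_ia32_addph512_mask`) only change the internal names the header expands to; user code continues to call the `_mm512_*` and `_mm_*` intrinsics shown above.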
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 90ba85e..4919d27 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -4966,8 +4966,8 @@ core_3, archs4x, archs4xd, archs4xd_slow" (const_int 1)) (label_ref (match_operand 1 "" "")) (pc))) - (set (match_dup 0) (plus (match_dup 0) (const_int -1))) - (unspec [(const_int 0)] UNSPEC_ARC_LP) + (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) + (unspec:SI [(const_int 0)] UNSPEC_ARC_LP) (clobber (match_dup 2))])] "" { @@ -4996,8 +4996,8 @@ core_3, archs4x, archs4xd, archs4xd_slow" (const_int 1)) (label_ref (match_operand 1 "" "")) (pc))) - (set (match_dup 0) (plus (match_dup 0) (const_int -1))) - (unspec [(const_int 0)] UNSPEC_ARC_LP) + (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) + (unspec:SI [(const_int 0)] UNSPEC_ARC_LP) (clobber (match_scratch:SI 2 "=X,&r"))] "" "@ diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index ed8ad84..a5041ed 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -192,6 +192,159 @@ _mm512_setzero_ph (void) return _mm512_set1_ph (0.0f); } +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_ph (void) +{ + __m128h __Y = __Y; + return __Y; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_undefined_ph (void) +{ + __m256h __Y = __Y; + return __Y; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_ph (void) +{ + __m512h __Y = __Y; + return __Y; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_h (__m128h __A) +{ + return __A[0]; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsh_h (__m256h __A) +{ + return __A[0]; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsh_h (__m512h __A) +{ + return __A[0]; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_ps (__m512h __a) +{ + return (__m512) __a; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_pd (__m512h __a) +{ + return (__m512d) __a; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_si512 (__m512h __a) +{ + return (__m512i) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph512_ph128 (__m512h __A) +{ + union + { + __m128h a[4]; + __m512h v; + } u = { .v = __A }; + return u.a[0]; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph512_ph256 (__m512h __A) +{ + union + { + __m256h a[2]; + __m512h v; + } u = { .v = __A }; + return u.a[0]; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph128_ph512 (__m128h __A) +{ + union + { + __m128h a[4]; + __m512h v; + } u; + u.a[0] = __A; + return u.v; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph256_ph512 (__m256h __A) +{ + union + { + __m256h a[2]; + __m512h v; + } u; + u.a[0] = __A; + return u.v; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_zextph128_ph512 (__m128h __A) +{ + return (__m512h) _mm512_insertf32x4 (_mm512_setzero_ps (), + (__m128) __A, 0); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextph256_ph512 (__m256h __A) +{ + return (__m512h) _mm512_insertf64x4 (_mm512_setzero_pd (), + (__m256d) __A, 0); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps_ph (__m512 __a) +{ + return (__m512h) __a; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd_ph (__m512d __a) +{ + return (__m512h) __a; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_ph (__m512i __a) +{ + return (__m512h) __a; +} + /* Create a vector with element 0 as F and the rest zero. */ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -229,15 +382,15 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_addph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vaddph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_addph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } extern __inline __m512h @@ -251,15 +404,15 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_subph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vsubph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_subph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } extern __inline __m512h @@ -273,15 +426,15 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_mulph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vmulph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_mulph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } extern __inline __m512h @@ -295,15 +448,15 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_divph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vdivph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_divph512_mask 
(__B, __C, + _mm512_setzero_ph (), __A); } #ifdef __OPTIMIZE__ @@ -311,9 +464,9 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_add_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_addph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -321,7 +474,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_addph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -329,18 +482,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_addph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_subph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -348,7 +501,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_subph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -356,18 +509,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_subph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_mulph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -375,7 +528,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_mulph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -383,18 +536,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_mulph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_div_round_ph (__m512h __A, 
__m512h __B, const int __C) { - return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_divph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -402,7 +555,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_divph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -410,67 +563,67 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_divph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } #else #define _mm512_add_round_ph(A, B, C) \ - ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B), \ - _mm512_setzero_ph (),\ - (__mmask32)-1, (C))) + ((__m512h)__builtin_ia32_addph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) -#define _mm512_mask_add_round_ph(A, B, C, D, E) \ - ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E))) +#define _mm512_mask_add_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_addph512_mask_round((C), (D), (A), (B), (E))) #define _mm512_maskz_add_round_ph(A, B, C, D) \ - ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C), \ - _mm512_setzero_ph (),\ - (A), (D))) + ((__m512h)__builtin_ia32_addph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #define _mm512_sub_round_ph(A, B, C) \ - ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B), \ - _mm512_setzero_ph (),\ - (__mmask32)-1, (C))) + ((__m512h)__builtin_ia32_subph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) -#define _mm512_mask_sub_round_ph(A, B, C, D, E) \ - ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E))) +#define _mm512_mask_sub_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_subph512_mask_round((C), (D), (A), (B), (E))) #define _mm512_maskz_sub_round_ph(A, B, C, D) \ - ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C), \ - _mm512_setzero_ph (),\ - (A), (D))) + ((__m512h)__builtin_ia32_subph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #define _mm512_mul_round_ph(A, B, C) \ - ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B), \ - _mm512_setzero_ph (),\ - (__mmask32)-1, (C))) + ((__m512h)__builtin_ia32_mulph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) -#define _mm512_mask_mul_round_ph(A, B, C, D, E) \ - ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E))) +#define _mm512_mask_mul_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_mulph512_mask_round((C), (D), (A), (B), (E))) #define _mm512_maskz_mul_round_ph(A, B, C, D) \ - ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C), \ - _mm512_setzero_ph (),\ - (A), (D))) + ((__m512h)__builtin_ia32_mulph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #define _mm512_div_round_ph(A, B, C) \ - ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B), \ - _mm512_setzero_ph (),\ - (__mmask32)-1, (C))) + ((__m512h)__builtin_ia32_divph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) -#define 
_mm512_mask_div_round_ph(A, B, C, D, E) \ - ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E))) +#define _mm512_mask_div_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_divph512_mask_round((C), (D), (A), (B), (E))) #define _mm512_maskz_div_round_ph(A, B, C, D) \ - ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C), \ - _mm512_setzero_ph (),\ - (A), (D))) + ((__m512h)__builtin_ia32_divph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #endif /* __OPTIMIZE__ */ /* Intrinsics of v[add,sub,mul,div]sh. */ extern __inline __m128h -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_sh (__m128h __A, __m128h __B) { __A[0] += __B[0]; @@ -481,15 +634,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_add_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vaddsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_addsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_add_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vaddsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_addsh_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m128h @@ -504,15 +657,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_sub_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vsubsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_subsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_sub_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vsubsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_subsh_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m128h @@ -527,14 +680,14 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_mul_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vmulsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_mulsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_mul_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vmulsh_v8hf_mask (__B, __C, _mm_setzero_ph (), __A); + return __builtin_ia32_mulsh_mask (__B, __C, _mm_setzero_ph (), __A); } extern __inline __m128h @@ -549,15 +702,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_div_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vdivsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_divsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_div_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vdivsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_divsh_mask (__B, __C, _mm_setzero_ph (), + __A); } #ifdef __OPTIMIZE__ @@ -565,9 +718,9 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vaddsh_v8hf_mask_round (__A, __B, 
- _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_addsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -575,7 +728,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_add_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vaddsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_addsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -583,18 +736,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_add_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vaddsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_addsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vsubsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_subsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -602,7 +755,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_sub_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vsubsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_subsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -610,18 +763,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_sub_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vsubsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_subsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vmulsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_mulsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -629,7 +782,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_mul_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vmulsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_mulsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -637,18 +790,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_mul_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vmulsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_mulsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_div_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vdivsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_divsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -656,7 +809,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_div_round_sh (__m128h __A, __mmask8 __B, __m128h __C, 
__m128h __D, const int __E) { - return __builtin_ia32_vdivsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_divsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -664,62 +817,62 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_div_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vdivsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_divsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } #else #define _mm_add_round_sh(A, B, C) \ - ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) + ((__m128h)__builtin_ia32_addsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) #define _mm_mask_add_round_sh(A, B, C, D, E) \ - ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + ((__m128h)__builtin_ia32_addsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_add_round_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_add_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_addsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #define _mm_sub_round_sh(A, B, C) \ - ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) + ((__m128h)__builtin_ia32_subsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) #define _mm_mask_sub_round_sh(A, B, C, D, E) \ - ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + ((__m128h)__builtin_ia32_subsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_sub_round_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_sub_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_subsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #define _mm_mul_round_sh(A, B, C) \ - ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) + ((__m128h)__builtin_ia32_mulsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) #define _mm_mask_mul_round_sh(A, B, C, D, E) \ - ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + ((__m128h)__builtin_ia32_mulsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_mul_round_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_mul_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_mulsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #define _mm_div_round_sh(A, B, C) \ - ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) + ((__m128h)__builtin_ia32_divsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) #define _mm_mask_div_round_sh(A, B, C, D, E) \ - ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + ((__m128h)__builtin_ia32_divsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_div_round_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_div_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_divsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #endif /* __OPTIMIZE__ */ /* Intrinsic vmaxph vminph. 
*/ @@ -727,48 +880,48 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_max_ph (__m512h __A, __m512h __B) { - return __builtin_ia32_vmaxph_v32hf_mask (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1); + return __builtin_ia32_maxph512_mask (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_max_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vmaxph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_maxph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_max_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vmaxph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_maxph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_min_ph (__m512h __A, __m512h __B) { - return __builtin_ia32_vminph_v32hf_mask (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1); + return __builtin_ia32_minph512_mask (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_min_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vminph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_minph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_min_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vminph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_minph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } #ifdef __OPTIMIZE__ @@ -776,9 +929,9 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_max_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vmaxph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_maxph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -786,7 +939,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_max_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vmaxph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_maxph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -794,18 +947,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_max_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vmaxph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_maxph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_min_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vminph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_minph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -813,7 +966,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
_mm512_mask_min_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vminph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_minph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -821,37 +974,37 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_min_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vminph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_minph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } #else -#define _mm512_max_round_ph(A, B, C) \ - (__builtin_ia32_vmaxph_v32hf_mask_round ((A), (B), \ - _mm512_setzero_ph (), \ - (__mmask32)-1, (C))) +#define _mm512_max_round_ph(A, B, C) \ + (__builtin_ia32_maxph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) #define _mm512_mask_max_round_ph(A, B, C, D, E) \ - (__builtin_ia32_vmaxph_v32hf_mask_round ((C), (D), (A), (B), (E))) + (__builtin_ia32_maxph512_mask_round ((C), (D), (A), (B), (E))) -#define _mm512_maskz_max_round_ph(A, B, C, D) \ - (__builtin_ia32_vmaxph_v32hf_mask_round ((B), (C), \ - _mm512_setzero_ph (), \ - (A), (D))) +#define _mm512_maskz_max_round_ph(A, B, C, D) \ + (__builtin_ia32_maxph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) -#define _mm512_min_round_ph(A, B, C) \ - (__builtin_ia32_vminph_v32hf_mask_round ((A), (B), \ - _mm512_setzero_ph (), \ - (__mmask32)-1, (C))) +#define _mm512_min_round_ph(A, B, C) \ + (__builtin_ia32_minph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) #define _mm512_mask_min_round_ph(A, B, C, D, E) \ - (__builtin_ia32_vminph_v32hf_mask_round ((C), (D), (A), (B), (E))) + (__builtin_ia32_minph512_mask_round ((C), (D), (A), (B), (E))) -#define _mm512_maskz_min_round_ph(A, B, C, D) \ - (__builtin_ia32_vminph_v32hf_mask_round ((B), (C), \ - _mm512_setzero_ph (), \ - (A), (D))) +#define _mm512_maskz_min_round_ph(A, B, C, D) \ + (__builtin_ia32_minph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #endif /* __OPTIMIZE__ */ /* Intrinsic vmaxsh vminsh. 
*/ @@ -867,15 +1020,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vmaxsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_maxsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vmaxsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_maxsh_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m128h @@ -890,15 +1043,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vminsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_minsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vminsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_minsh_mask (__B, __C, _mm_setzero_ph (), + __A); } #ifdef __OPTIMIZE__ @@ -906,9 +1059,9 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vmaxsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_maxsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -916,7 +1069,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vmaxsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_maxsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -924,18 +1077,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vmaxsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_maxsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vminsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_minsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -943,7 +1096,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vminsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_minsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -951,37 +1104,37 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vminsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_minsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } #else -#define _mm_max_round_sh(A, B, C) \ - (__builtin_ia32_vmaxsh_v8hf_mask_round ((A), (B), \ - 
_mm_setzero_ph (), \ - (__mmask8)-1, (C))) +#define _mm_max_round_sh(A, B, C) \ + (__builtin_ia32_maxsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) -#define _mm_mask_max_round_sh(A, B, C, D, E) \ - (__builtin_ia32_vmaxsh_v8hf_mask_round ((C), (D), (A), (B), (E))) +#define _mm_mask_max_round_sh(A, B, C, D, E) \ + (__builtin_ia32_maxsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_max_round_sh(A, B, C, D) \ - (__builtin_ia32_vmaxsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_max_round_sh(A, B, C, D) \ + (__builtin_ia32_maxsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) -#define _mm_min_round_sh(A, B, C) \ - (__builtin_ia32_vminsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) +#define _mm_min_round_sh(A, B, C) \ + (__builtin_ia32_minsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) -#define _mm_mask_min_round_sh(A, B, C, D, E) \ - (__builtin_ia32_vminsh_v8hf_mask_round ((C), (D), (A), (B), (E))) +#define _mm_mask_min_round_sh(A, B, C, D, E) \ + (__builtin_ia32_minsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_min_round_sh(A, B, C, D) \ - (__builtin_ia32_vminsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_min_round_sh(A, B, C, D) \ + (__builtin_ia32_minsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #endif /* __OPTIMIZE__ */ @@ -991,8 +1144,8 @@ extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cmp_ph_mask (__m512h __A, __m512h __B, const int __C) { - return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask (__A, __B, __C, - (__mmask32) -1); + return (__mmask32) __builtin_ia32_cmpph512_mask (__A, __B, __C, + (__mmask32) -1); } extern __inline __mmask32 @@ -1000,8 +1153,8 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_cmp_ph_mask (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask (__B, __C, __D, - __A); + return (__mmask32) __builtin_ia32_cmpph512_mask (__B, __C, __D, + __A); } extern __inline __mmask32 @@ -1009,9 +1162,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cmp_round_ph_mask (__m512h __A, __m512h __B, const int __C, const int __D) { - return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask_round (__A, __B, - __C, (__mmask32) -1, - __D); + return (__mmask32) __builtin_ia32_cmpph512_mask_round (__A, __B, + __C, (__mmask32) -1, + __D); } extern __inline __mmask32 @@ -1019,23 +1172,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_cmp_round_ph_mask (__mmask32 __A, __m512h __B, __m512h __C, const int __D, const int __E) { - return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask_round (__B, __C, - __D, __A, - __E); + return (__mmask32) __builtin_ia32_cmpph512_mask_round (__B, __C, + __D, __A, + __E); } #else #define _mm512_cmp_ph_mask(A, B, C) \ - (__builtin_ia32_vcmpph_v32hf_mask ((A), (B), (C), (-1))) + (__builtin_ia32_cmpph512_mask ((A), (B), (C), (-1))) #define _mm512_mask_cmp_ph_mask(A, B, C, D) \ - (__builtin_ia32_vcmpph_v32hf_mask ((B), (C), (D), (A))) + (__builtin_ia32_cmpph512_mask ((B), (C), (D), (A))) -#define _mm512_cmp_round_ph_mask(A, B, C, D) \ - (__builtin_ia32_vcmpph_v32hf_mask_round ((A), (B), (C), (-1), (D))) +#define _mm512_cmp_round_ph_mask(A, B, C, D) \ + (__builtin_ia32_cmpph512_mask_round ((A), (B), (C), (-1), (D))) -#define _mm512_mask_cmp_round_ph_mask(A, B, 
C, D, E) \ - (__builtin_ia32_vcmpph_v32hf_mask_round ((B), (C), (D), (A), (E))) +#define _mm512_mask_cmp_round_ph_mask(A, B, C, D, E) \ + (__builtin_ia32_cmpph512_mask_round ((B), (C), (D), (A), (E))) #endif /* __OPTIMIZE__ */ @@ -1046,9 +1199,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmp_sh_mask (__m128h __A, __m128h __B, const int __C) { return (__mmask8) - __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, - __C, (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + __builtin_ia32_cmpsh_mask_round (__A, __B, + __C, (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline __mmask8 @@ -1057,9 +1210,9 @@ _mm_mask_cmp_sh_mask (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { return (__mmask8) - __builtin_ia32_vcmpsh_v8hf_mask_round (__B, __C, - __D, __A, - _MM_FROUND_CUR_DIRECTION); + __builtin_ia32_cmpsh_mask_round (__B, __C, + __D, __A, + _MM_FROUND_CUR_DIRECTION); } extern __inline __mmask8 @@ -1067,9 +1220,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmp_round_sh_mask (__m128h __A, __m128h __B, const int __C, const int __D) { - return (__mmask8) __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, - __C, (__mmask8) -1, - __D); + return (__mmask8) __builtin_ia32_cmpsh_mask_round (__A, __B, + __C, (__mmask8) -1, + __D); } extern __inline __mmask8 @@ -1077,25 +1230,25 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_cmp_round_sh_mask (__mmask8 __A, __m128h __B, __m128h __C, const int __D, const int __E) { - return (__mmask8) __builtin_ia32_vcmpsh_v8hf_mask_round (__B, __C, - __D, __A, - __E); + return (__mmask8) __builtin_ia32_cmpsh_mask_round (__B, __C, + __D, __A, + __E); } #else -#define _mm_cmp_sh_mask(A, B, C) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (C), (-1), \ - (_MM_FROUND_CUR_DIRECTION))) +#define _mm_cmp_sh_mask(A, B, C) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), \ + (_MM_FROUND_CUR_DIRECTION))) -#define _mm_mask_cmp_sh_mask(A, B, C, D) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((B), (C), (D), (A), \ - (_MM_FROUND_CUR_DIRECTION))) +#define _mm_mask_cmp_sh_mask(A, B, C, D) \ + (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), \ + (_MM_FROUND_CUR_DIRECTION))) -#define _mm_cmp_round_sh_mask(A, B, C, D) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (C), (-1), (D))) +#define _mm_cmp_round_sh_mask(A, B, C, D) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), (D))) -#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((B), (C), (D), (A), (E))) +#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \ + (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), (E))) #endif /* __OPTIMIZE__ */ @@ -1104,137 +1257,3792 @@ extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comieq_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_EQ_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comilt_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LT_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
_mm_comile_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LE_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comigt_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GT_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comige_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GE_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comineq_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_NEQ_US, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_US, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomieq_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_EQ_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomilt_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LT_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomile_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LE_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomigt_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GT_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomige_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GE_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomineq_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_NEQ_UQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_UQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } #ifdef __OPTIMIZE__ extern __inline int 
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comi_sh (__m128h __A, __m128h __B, const int __P) +_mm_comi_sh (__m128h __A, __m128h __B, const int __P) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, __P, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comi_round_sh (__m128h __A, __m128h __B, const int __P, const int __R) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, __P, - (__mmask8) -1,__R); + return __builtin_ia32_cmpsh_mask_round (__A, __B, __P, + (__mmask8) -1,__R); } #else -#define _mm_comi_round_sh(A, B, P, R) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (P), (__mmask8) (-1), (R))) -#define _mm_comi_sh(A, B, P) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (P), (__mmask8) (-1), \ - _MM_FROUND_CUR_DIRECTION)) +#define _mm_comi_round_sh(A, B, P, R) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), (R))) +#define _mm_comi_sh(A, B, P) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), \ + _MM_FROUND_CUR_DIRECTION)) #endif /* __OPTIMIZE__ */ +/* Intrinsics vsqrtph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_ph (__m512h __A) +{ + return __builtin_ia32_sqrtph512_mask_round (__A, + _mm512_setzero_ph(), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_sqrtph512_mask_round (__B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_ph (__m512h __A, const int __B) +{ + return __builtin_ia32_sqrtph512_mask_round (__A, + _mm512_setzero_ph(), + (__mmask32) -1, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_ph (__mmask32 __A, __m512h __B, const int __C) +{ + return __builtin_ia32_sqrtph512_mask_round (__B, + _mm512_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_sqrt_round_ph(A, B) \ + (__builtin_ia32_sqrtph512_mask_round ((A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (B))) + +#define _mm512_mask_sqrt_round_ph(A, B, C, D) \ + (__builtin_ia32_sqrtph512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_sqrt_round_ph(A, B, C) \ + (__builtin_ia32_sqrtph512_mask_round ((B), \ + _mm512_setzero_ph (), \ + (A), (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrsqrtph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt_ph (__m512h __A) +{ + return __builtin_ia32_rsqrtph512_mask (__A, _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_rsqrtph512_mask (__C, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_rsqrtph512_mask (__B, _mm512_setzero_ph (), + __A); +} + +/* Intrinsics vrsqrtsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_rsqrtsh_mask (__B, __A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_rsqrtsh_mask (__D, __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_rsqrtsh_mask (__C, __B, _mm_setzero_ph (), + __A); +} + +/* Intrinsics vsqrtsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_sqrtsh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_sqrtsh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_sqrtsh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B, + __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_sqrtsh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __D); +} + +#else +#define _mm_sqrt_round_sh(A, B, C) \ + (__builtin_ia32_sqrtsh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_sqrt_round_sh(A, B, C, D, E) \ + (__builtin_ia32_sqrtsh_mask_round ((D), (C), (A), (B), (E))) + +#define _mm_maskz_sqrt_round_sh(A, B, C, D) \ + (__builtin_ia32_sqrtsh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrcpph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp_ph (__m512h __A) +{ + return __builtin_ia32_rcpph512_mask (__A, _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_rcpph512_mask (__C, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_rcpph512_mask (__B, _mm512_setzero_ph (), + __A); +} + +/* Intrinsics vrcpsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_rcpsh_mask (__B, __A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp_sh (__m128h __A, __mmask32 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_rcpsh_mask (__D, __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp_sh (__mmask32 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_rcpsh_mask (__C, __B, _mm_setzero_ph (), + __A); +} + +/* Intrinsics vscalefph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_ph (__m512h __A, __m512h __B) +{ + return __builtin_ia32_scalefph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_scalefph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_scalefph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_scalefph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_scalef_round_ph(A, B, C) \ + (__builtin_ia32_scalefph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_scalef_round_ph(A, B, C, D, E) \ + (__builtin_ia32_scalefph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_scalef_round_ph(A, B, C, D) \ + (__builtin_ia32_scalefph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* 
__OPTIMIZE__ */ + +/* Intrinsics vscalefsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_scalefsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_scalefsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_scalefsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_scalefsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +#else +#define _mm_scalef_round_sh(A, B, C) \ + (__builtin_ia32_scalefsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_scalef_round_sh(A, B, C, D, E) \ + (__builtin_ia32_scalefsh_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_scalef_round_sh(A, B, C, D) \ + (__builtin_ia32_scalefsh_mask_round ((B), (C), _mm_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vreduceph. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_ph (__m512h __A, int __B) +{ + return __builtin_ia32_reduceph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_ph (__m512h __A, __mmask32 __B, __m512h __C, int __D) +{ + return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_ph (__mmask32 __A, __m512h __B, int __C) +{ + return __builtin_ia32_reduceph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_ph (__m512h __A, int __B, const int __C) +{ + return __builtin_ia32_reduceph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + int __D, const int __E) +{ + return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_ph (__mmask32 __A, __m512h __B, int __C, + const int __D) +{ + return __builtin_ia32_reduceph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_reduce_ph(A, B) \ + (__builtin_ia32_reduceph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_reduce_ph(A, B, C) \ + (__builtin_ia32_reduceph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_reduce_round_ph(A, B, C) \ + (__builtin_ia32_reduceph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_reduce_round_ph(A, B, C, D, E) \ + (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_reduce_round_ph(A, B, C, D) \ + (__builtin_ia32_reduceph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vreducesh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_sh (__m128h __A, __m128h __B, int __C) +{ + return __builtin_ia32_reducesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E) +{ + return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D) +{ + return __builtin_ia32_reducesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_sh (__m128h __A, __m128h __B, int __C, const int __D) +{ + return __builtin_ia32_reducesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E, const int __F) +{ + return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, + __B, __F); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + int __D, const int __E) +{ + return __builtin_ia32_reducesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_reduce_sh(A, B, C) \ + (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_reduce_sh(A, B, C, D, E) \ + (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_reduce_sh(A, B, C, D) \ + (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_reduce_round_sh(A, B, C, D) \ + (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (D))) + +#define _mm_mask_reduce_round_sh(A, B, C, D, E, F) \ + (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), (F))) + +#define _mm_maskz_reduce_round_sh(A, B, C, D, E) \ + (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscaleph. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m512h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_ph (__m512h __A, int __B) +{ + return __builtin_ia32_rndscaleph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_ph (__m512h __A, __mmask32 __B, + __m512h __C, int __D) +{ + return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_ph (__mmask32 __A, __m512h __B, int __C) +{ + return __builtin_ia32_rndscaleph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_round_ph (__m512h __A, int __B, const int __C) +{ + return __builtin_ia32_rndscaleph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_round_ph (__m512h __A, __mmask32 __B, + __m512h __C, int __D, const int __E) +{ + return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_round_ph (__mmask32 __A, __m512h __B, int __C, + const int __D) +{ + return __builtin_ia32_rndscaleph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm512_roundscale_round_ph(A, B, C) \ + (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_roundscale_round_ph(A, B, C, D, E) \ + (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_roundscale_round_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscalesh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_sh (__m128h __A, __m128h __B, int __C) +{ + return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E) +{ + return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D) +{ + return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_round_sh (__m128h __A, __m128h __B, int __C, const int __D) +{ + return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E, const int __F) +{ + return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, + __A, __B, __F); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + int __D, const int __E) +{ + return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_roundscale_sh(A, B, C) \ + (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_sh(A, B, C, D, E) \ + (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_roundscale_sh(A, B, C, D) \ + (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_roundscale_round_sh(A, B, C, D) \ + (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (D))) + +#define _mm_mask_roundscale_round_sh(A, B, C, D, E, F) \ + (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), (F))) + +#define _mm_maskz_roundscale_round_sh(A, B, C, D, E) \ + (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfpclasssh. 
*/
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sh_mask (__m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sh_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm, __U);
+}
+
+#else
+#define _mm_fpclass_sh_mask(X, C) \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X), \
+                                             (int) (C), (__mmask8) (-1)))
+
+#define _mm_mask_fpclass_sh_mask(U, X, C) \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X), \
+                                             (int) (C), (__mmask8) (U)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclassph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_ph_mask (__mmask32 __U, __m512h __A,
+                             const int __imm)
+{
+  return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+                                                       __imm, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_ph_mask (__m512h __A, const int __imm)
+{
+  return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+                                                       __imm,
+                                                       (__mmask32) -1);
+}
+
+#else
+#define _mm512_mask_fpclass_ph_mask(u, x, c) \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+                                                 (int) (c), (__mmask32) (u)))
+
+#define _mm512_fpclass_ph_mask(x, c) \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+                                                 (int) (c), (__mmask32) -1))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vgetexpph, vgetexpsh. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_sh (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) __W, (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_sh (__mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) _mm_setzero_ph (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_ph (__m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) _mm512_setzero_ph (), + (__mmask32) -1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_ph (__m512h __W, __mmask32 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, (__v32hf) __W, + (__mmask32) __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_ph (__mmask32 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) _mm512_setzero_ph (), + (__mmask32) __U, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_sh (__m128h __A, __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + _mm_setzero_ph (), + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_round_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_round_sh (__mmask8 __U, __m128h __A, __m128h __B, + const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_ph (__m512h __A, const int __R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) + _mm512_setzero_ph (), + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_ph (__m512h __W, __mmask32 __U, __m512h __A, + const int __R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) __W, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_ph (__mmask32 __U, __m512h __A, const int 
__R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) + _mm512_setzero_ph (), + (__mmask32) __U, __R); +} + +#else +#define _mm_getexp_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A), \ + (__v8hf)(__m128h)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, R)) + +#define _mm_mask_getexp_round_sh(W, U, A, B, C) \ + (__m128h)__builtin_ia32_getexpsh_mask_round(A, B, W, U, C) + +#define _mm_maskz_getexp_round_sh(U, A, B, C) \ + (__m128h)__builtin_ia32_getexpsh_mask_round(A, B, \ + (__v8hf)_mm_setzero_ph(), \ + U, C) + +#define _mm512_getexp_round_ph(A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, R)) + +#define _mm512_mask_getexp_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(W), (__mmask32)(U), R)) + +#define _mm512_maskz_getexp_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), R)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vgetmantph, vgetmantsh. */ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_sh (__m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, (__v8hf) __W, + __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_sh (__mmask8 __U, __m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) _mm_setzero_ph(), + __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_ph (__m512h __W, __mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_ph (__mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) + _mm512_setzero_ph (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_getmant_round_sh (__m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + _mm_setzero_ph (), + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_round_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) __W, + __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_round_sh (__mmask8 __U, __m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) + _mm_setzero_ph(), + __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + _mm512_setzero_ph (), + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_ph (__m512h __W, __mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) __W, __U, + __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) + _mm512_setzero_ph (), + __U, __R); +} + +#else +#define _mm512_getmant_ph(X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ph(W, U, X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + + +#define _mm512_maskz_getmant_ph(U, X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getmant_sh(X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sh(W, U, X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_sh(U, X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | 
(C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getmant_round_ph(X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)-1, \ + (R))) + +#define _mm512_mask_getmant_round_ph(W, U, X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), \ + (R))) + + +#define _mm512_maskz_getmant_round_ph(U, X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)(U), \ + (R))) + +#define _mm_getmant_round_sh(X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (R))) + +#define _mm_mask_getmant_round_sh(W, U, X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U), \ + (R))) + +#define _mm_maskz_getmant_round_sh(U, X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph(), \ + (__mmask8)(U), \ + (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vmovw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi16_si128 (short __A) +{ + return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si16 (__m128i __A) +{ + return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, 0); +} + +/* Intrinsics vmovsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C) +{ + return __builtin_ia32_loadsh_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B) +{ + return __builtin_ia32_loadsh_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C) +{ + __builtin_ia32_storesh_mask (__A, __C, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sh (__m128h __A, __m128h __B) +{ + __A[0] = __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmovsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A); +} + +/* Intrinsics vcvtph2dq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi32 (__m512i __A, __mmask16 __B, __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((C), (__v16si)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epi32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2udq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu32 (__m512i __A, __mmask16 __B, __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epu32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((C), (__v16si)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2dq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi32 (__m512i __A, __mmask16 __B, + __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((A), \ + (__v16si) \ + (_mm512_setzero_si512 ()), \ + (__mmask16)(-1), (B))) + +#define _mm512_mask_cvtt_roundph_epi32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((C), \ + (__v16si)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epi32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2udq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu32 (__m512i __A, __mmask16 __B, + __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((C), \ + (__v16si)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epu32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtdq2ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_ph (__m512i __A) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_ph (__m256h __A, __mmask16 __B, __m512i __C) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_ph (__mmask16 __A, __m512i __B) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B, + _mm256_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi32_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C, + __A, + __B, + __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi32_ph (__mmask16 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B, + _mm256_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi32_ph(A, B) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(A), \ + _mm256_setzero_ph (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi32_ph(A, B, C, D) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepi32_ph(A, B, C) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(B), \ + _mm256_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtudq2ph. 
*/
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A,
+                                                  _mm256_setzero_ph (),
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ph (__m256h __A, __mmask16 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C,
+                                                  __A,
+                                                  __B,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ph (__mmask16 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B,
+                                                  _mm256_setzero_ph (),
+                                                  __A,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu32_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A,
+                                                  _mm256_setzero_ph (),
+                                                  (__mmask16) -1,
+                                                  __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C,
+                                                  __A,
+                                                  __B,
+                                                  __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu32_ph (__mmask16 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B,
+                                                  _mm256_setzero_ph (),
+                                                  __A,
+                                                  __C);
+}
+
+#else
+#define _mm512_cvt_roundepu32_ph(A, B) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(A), \
+                                            _mm256_setzero_ph (), \
+                                            (__mmask16)-1, \
+                                            (B)))
+
+#define _mm512_mask_cvt_roundepu32_ph(A, B, C, D) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(C), \
+                                            (A), \
+                                            (B), \
+                                            (D)))
+
+#define _mm512_maskz_cvt_roundepu32_ph(A, B, C) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(B), \
+                                            _mm256_setzero_ph (), \
+                                            (A), \
+                                            (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2qq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi64(A, B) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi64(A, B, C, D) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epi64(A, B, C) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2uqq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu64(A, B) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epu64(A, B, C, D) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu64(A, B, C) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2qq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi64(A, B) \ + (__builtin_ia32_vcvttph2qq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epi64(A, B, C, D) \ + __builtin_ia32_vcvttph2qq512_mask_round ((C), (A), (B), (D)) + +#define _mm512_maskz_cvtt_roundph_epi64(A, B, C) \ + (__builtin_ia32_vcvttph2qq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2uqq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu64(A, B) \ + (__builtin_ia32_vcvttph2uqq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu64(A, B, C, D) \ + __builtin_ia32_vcvttph2uqq512_mask_round ((C), (A), (B), (D)) + +#define _mm512_maskz_cvtt_roundph_epu64(A, B, C) \ + (__builtin_ia32_vcvttph2uqq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtqq2ph. 
*/ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_ph (__m512i __A) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m512i __C) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_ph (__mmask8 __A, __m512i __B) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi64_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi64_ph (__mmask8 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi64_ph(A, B) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi64_ph(A, B, C, D) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundepi64_ph(A, B, C) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(B), \ + _mm_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtuqq2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu64_ph (__m512i __A) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m512i __C) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu64_ph (__mmask8 __A, __m512i __B) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu64_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu64_ph (__mmask8 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepu64_ph(A, B) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepu64_ph(A, B, C, D) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundepu64_ph(A, B, C) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(B), \ + _mm_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2w. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi16 (__m512i __A, __mmask32 __B, __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi16(A, B) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi16(A, B, C, D) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundph_epi16(A, B, C) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2uw. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu16 (__m512i __A, __mmask32 __B, __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, (B))) + +#define _mm512_mask_cvt_roundph_epu16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((C), (__v32hi)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2w. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi16 (__m512i __A, __mmask32 __B, + __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epi16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epi16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2uw. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu16 (__m512i __A, __mmask32 __B, + __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epu16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtw2ph. 
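   A hedged sketch (editorial addition, not part of the patch), assuming
   <immintrin.h> and -mavx512fp16: converting signed 16-bit lanes to
   _Float16.

     __m512i w = _mm512_set1_epi16 (-3);
     __m512h h = _mm512_cvtepi16_ph (w);
     __m512h z = _mm512_maskz_cvtepi16_ph (0x0000ffff, w);  // upper lanes zeroed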
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_ph (__m512i __A) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_ph (__m512h __A, __mmask32 __B, __m512i __C) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_ph (__mmask32 __A, __m512i __B) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi16_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi16_ph (__mmask32 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi16_ph(A, B) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi16_ph(A, B, C, D) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepi16_ph(A, B, C) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(B), \ + _mm512_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtuw2ph. 
*/ + extern __inline __m512h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu16_ph (__m512i __A) + { + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); + } + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu16_ph (__m512h __A, __mmask32 __B, __m512i __C) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu16_ph (__mmask32 __A, __m512i __B) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu16_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu16_ph (__mmask32 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepu16_ph(A, B) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepu16_ph(A, B, C, D) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepu16_ph(A, B, C) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(B), \ + _mm512_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtsh2si, vcvtsh2us. 
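   Editorial usage sketch, not in the patch: reading the low _Float16
   element of a __m128h back as a scalar integer.

     __m128h  s = _mm_cvti32_sh (_mm_setzero_ph (), 41);   // low lane = 41.0
     int      i = _mm_cvtsh_i32 (s);                       // 41
     unsigned u = _mm_cvtsh_u32 (s);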
*/ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_i32 (__m128h __A) +{ + return (int) __builtin_ia32_vcvtsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_u32 (__m128h __A) +{ + return (int) __builtin_ia32_vcvtsh2usi32_round (__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_i32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsh2si32_round (__A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_u32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsh2usi32_round (__A, __R); +} + +#else +#define _mm_cvt_roundsh_i32(A, B) \ + ((int)__builtin_ia32_vcvtsh2si32_round ((A), (B))) +#define _mm_cvt_roundsh_u32(A, B) \ + ((int)__builtin_ia32_vcvtsh2usi32_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_i64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvtsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_u64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvtsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_i64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsh2si64_round (__A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_u64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsh2usi64_round (__A, __R); +} + +#else +#define _mm_cvt_roundsh_i64(A, B) \ + ((long long)__builtin_ia32_vcvtsh2si64_round ((A), (B))) +#define _mm_cvt_roundsh_u64(A, B) \ + ((long long)__builtin_ia32_vcvtsh2usi64_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvttsh2si, vcvttsh2us. 
*/ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_i32 (__m128h __A) +{ + return (int) + __builtin_ia32_vcvttsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_u32 (__m128h __A) +{ + return (int) + __builtin_ia32_vcvttsh2usi32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_i32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsh2si32_round (__A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_u32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsh2usi32_round (__A, __R); +} + +#else +#define _mm_cvtt_roundsh_i32(A, B) \ + ((int)__builtin_ia32_vcvttsh2si32_round ((A), (B))) +#define _mm_cvtt_roundsh_u32(A, B) \ + ((int)__builtin_ia32_vcvttsh2usi32_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_i64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvttsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_u64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvttsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_i64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsh2si64_round (__A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_u64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsh2usi64_round (__A, __R); +} + +#else +#define _mm_cvtt_roundsh_i64(A, B) \ + ((long long)__builtin_ia32_vcvttsh2si64_round ((A), (B))) +#define _mm_cvtt_roundsh_u64(A, B) \ + ((long long)__builtin_ia32_vcvttsh2usi64_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvtsi2sh, vcvtusi2sh. 
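   Editorial note, not part of the patch: these convert a scalar integer
   into the low lane and copy the remaining lanes from the first operand.

     __m128h a = _mm_setzero_ph ();
     __m128h b = _mm_cvti32_sh (a, -7);        // low lane = -7.0, rest 0
     __m128h c = _mm_cvtu64_sh (a, 123ULL);    // only under __x86_64__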
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_sh (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtsi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_sh (__m128h __A, unsigned int __B) +{ + return __builtin_ia32_vcvtusi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi32_sh (__m128h __A, int __B, const int __R) +{ + return __builtin_ia32_vcvtsi2sh32_round (__A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu32_sh (__m128h __A, unsigned int __B, const int __R) +{ + return __builtin_ia32_vcvtusi2sh32_round (__A, __B, __R); +} + +#else +#define _mm_cvt_roundi32_sh(A, B, C) \ + (__builtin_ia32_vcvtsi2sh32_round ((A), (B), (C))) +#define _mm_cvt_roundu32_sh(A, B, C) \ + (__builtin_ia32_vcvtusi2sh32_round ((A), (B), (C))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_sh (__m128h __A, long long __B) +{ + return __builtin_ia32_vcvtsi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_sh (__m128h __A, unsigned long long __B) +{ + return __builtin_ia32_vcvtusi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_sh (__m128h __A, long long __B, const int __R) +{ + return __builtin_ia32_vcvtsi2sh64_round (__A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_sh (__m128h __A, unsigned long long __B, const int __R) +{ + return __builtin_ia32_vcvtusi2sh64_round (__A, __B, __R); +} + +#else +#define _mm_cvt_roundi64_sh(A, B, C) \ + (__builtin_ia32_vcvtsi2sh64_round ((A), (B), (C))) +#define _mm_cvt_roundu64_sh(A, B, C) \ + (__builtin_ia32_vcvtusi2sh64_round ((A), (B), (C))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvtph2pd. 
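   Usage sketch (editorial addition, not in the patch): widening the low
   eight _Float16 values of a __m128h to double precision.

     __m128h h  = _mm_cvtu32_sh (_mm_setzero_ph (), 3);    // low lane 3.0
     __m512d d  = _mm512_cvtph_pd (h);
     __m512d dz = _mm512_maskz_cvtph_pd (0x0f, h);         // low 4 lanes only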
*/ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__A, + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_pd (__m512d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__B, + _mm512_setzero_pd (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_pd (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__A, + _mm512_setzero_pd (), + (__mmask8) -1, + __B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_pd (__m512d __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_pd (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__B, + _mm512_setzero_pd (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_pd(A, B) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((A), \ + _mm512_setzero_pd (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_pd(A, B, C, D) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_pd(A, B, C) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((B), \ + _mm512_setzero_pd (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2psx. 
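   Editorial sketch, not from the patch: vcvtph2psx widens 16 _Float16
   values to single precision; the masked forms follow the usual
   write-mask pattern.

     __m256h h = _mm256_setzero_ph ();
     __m512  f = _mm512_cvtxph_ps (h);
     __m512  g = _mm512_mask_cvtxph_ps (f, 0x00ff, h);  // high 8 lanes kept from f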
*/ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtxph_ps (__m256h __A) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__A, + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtxph_ps (__m512 __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtxph_ps (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__B, + _mm512_setzero_ps (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtx_roundph_ps (__m256h __A, int __B) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__A, + _mm512_setzero_ps (), + (__mmask16) -1, + __B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtx_roundph_ps (__m512 __A, __mmask16 __B, __m256h __C, int __D) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtx_roundph_ps (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__B, + _mm512_setzero_ps (), + __A, + __C); +} + +#else +#define _mm512_cvtx_roundph_ps(A, B) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((A), \ + _mm512_setzero_ps (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvtx_roundph_ps(A, B, C, D) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvtx_roundph_ps(A, B, C) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((B), \ + _mm512_setzero_ps (), \ + (A), \ + (C))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtps2ph. 
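   A hedged usage sketch (editorial, not part of the patch): narrowing 16
   floats to _Float16, optionally zeroing lanes under a write mask.

     __m512  f  = _mm512_set1_ps (0.5f);
     __m256h h  = _mm512_cvtxps_ph (f);
     __m256h hz = _mm512_maskz_cvtxps_ph (0x000f, f);    // low 4 lanes only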
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtxps_ph (__m512 __A) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtxps_ph (__m256h __A, __mmask16 __B, __m512 __C) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C, + __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtxps_ph (__mmask16 __A, __m512 __B) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B, + _mm256_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtx_roundps_ph (__m512 __A, int __B) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtx_roundps_ph (__m256h __A, __mmask16 __B, __m512 __C, int __D) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C, + __A, __B, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtx_roundps_ph (__mmask16 __A, __m512 __B, int __C) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B, + _mm256_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_cvtx_roundps_ph(A, B) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(A), \ + _mm256_setzero_ph (),\ + (__mmask16)-1, (B))) + +#define _mm512_mask_cvtx_roundps_ph(A, B, C, D) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(C), \ + (A), (B), (D))) + +#define _mm512_maskz_cvtx_roundps_ph(A, B, C) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(B), \ + _mm256_setzero_ph (),\ + (A), (C))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtpd2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_ph (__m512d __A) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m512d __C) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C, + __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_ph (__mmask8 __A, __m512d __B) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_ph (__m512d __A, int __B) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_ph (__m128h __A, __mmask8 __B, __m512d __C, int __D) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C, + __A, __B, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_ph (__mmask8 __A, __m512d __B, int __C) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B, + _mm_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_cvt_roundpd_ph(A, B) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (B))) + +#define _mm512_mask_cvt_roundpd_ph(A, B, C, D) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(C), \ + (A), (B), (D))) + +#define _mm512_maskz_cvt_roundpd_ph(A, B, C) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(B), \ + _mm_setzero_ph (), \ + (A), (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtsh2ss, vcvtsh2sd. 
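   Editorial example, not from the patch: the scalar converts only touch
   the low element and take the upper lanes from their first operand.

     __m128h h = _mm_cvti32_sh (_mm_setzero_ph (), 9);
     __m128  s = _mm_cvtsh_ss (_mm_setzero_ps (), h);    // low lane 9.0f
     __m128d d = _mm_cvtsh_sd (_mm_setzero_pd (), h);    // low lane 9.0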
*/ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_ss (__m128 __A, __m128h __B) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A, + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsh_ss (__m128 __A, __mmask8 __B, __m128 __C, + __m128h __D) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsh_ss (__mmask8 __A, __m128 __B, + __m128h __C) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B, + _mm_setzero_ps (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_sd (__m128d __A, __m128h __B) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A, + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsh_sd (__m128d __A, __mmask8 __B, __m128d __C, + __m128h __D) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsh_sd (__mmask8 __A, __m128d __B, __m128h __C) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B, + _mm_setzero_pd (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_ss (__m128 __A, __m128h __B, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A, + _mm_setzero_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsh_ss (__m128 __A, __mmask8 __B, __m128 __C, + __m128h __D, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsh_ss (__mmask8 __A, __m128 __B, + __m128h __C, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B, + _mm_setzero_ps (), + __A, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_sd (__m128d __A, __m128h __B, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A, + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsh_sd (__m128d __A, __mmask8 __B, __m128d __C, + __m128h __D, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsh_sd (__mmask8 __A, __m128d __B, __m128h __C, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B, + _mm_setzero_pd (), + __A, __R); +} + +#else +#define _mm_cvt_roundsh_ss(A, B, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((B), (A), \ + _mm_setzero_ps (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_cvt_roundsh_ss(A, B, C, D, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsh_ss(A, 
B, C, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((C), (B), \ + _mm_setzero_ps (), \ + (A), (R))) + +#define _mm_cvt_roundsh_sd(A, B, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((B), (A), \ + _mm_setzero_pd (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_cvt_roundsh_sd(A, B, C, D, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsh_sd(A, B, C, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((C), (B), \ + _mm_setzero_pd (), \ + (A), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtss2sh, vcvtsd2sh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_sh (__m128h __A, __m128 __B) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtss_sh (__mmask8 __A, __m128h __B, __m128 __C) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_sh (__m128h __A, __m128d __B) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsd_sh (__mmask8 __A, __m128h __B, __m128d __C) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_sh (__m128h __A, __m128 __B, const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D, + const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundss_sh (__mmask8 __A, __m128h __B, __m128 __C, + const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_sh (__m128h __A, __m128d __B, const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D, + const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, __R); +} + +extern 
__inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsd_sh (__mmask8 __A, __m128h __B, __m128d __C, + const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __R); +} + +#else +#define _mm_cvt_roundss_sh(A, B, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8) -1, R)) + +#define _mm_mask_cvt_roundss_sh(A, B, C, D, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundss_sh(A, B, C, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + A, R)) + +#define _mm_cvt_roundsd_sh(A, B, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8) -1, R)) + +#define _mm_mask_cvt_roundsd_sh(A, B, C, D, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsd_sh(A, B, C, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + (A), (R))) + +#endif /* __OPTIMIZE__ */ + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h index 1787ed5..59906d2 100644 --- a/gcc/config/i386/avx512fp16vlintrin.h +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -34,6 +34,123 @@ #define __DISABLE_AVX512FP16VL__ #endif /* __AVX512FP16VL__ */ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_ps (__m128h __a) +{ + return (__m128) __a; +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_ps (__m256h __a) +{ + return (__m256) __a; +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_pd (__m128h __a) +{ + return (__m128d) __a; +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_pd (__m256h __a) +{ + return (__m256d) __a; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_si128 (__m128h __a) +{ + return (__m128i) __a; +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_si256 (__m256h __a) +{ + return (__m256i) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_ph (__m128 __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_ph (__m256 __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_ph (__m128d __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_ph (__m256d __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_ph (__m128i __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_ph (__m256i __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph256_ph128 (__m256h __A) +{ + union + { + __m128h a[2]; + __m256h v; + } u = { .v = __A }; + return 
u.a[0]; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph128_ph256 (__m128h __A) +{ + union + { + __m128h a[2]; + __m256h v; + } u; + u.a[0] = __A; + return u.v; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextph128_ph256 (__m128h __A) +{ + return (__m256h) _mm256_insertf128_ps (_mm256_setzero_ps (), + (__m128) __A, 0); +} + /* Intrinsics v[add,sub,mul,div]ph. */ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -53,30 +170,30 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_addph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_addph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_addph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vaddph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_addph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } extern __inline __m128h @@ -97,30 +214,30 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_subph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_subph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_subph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vsubph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_subph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } extern __inline __m128h @@ -141,30 +258,30 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_mulph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_mul_ph (__m256h __A, __mmask16 __B, 
__m256h __C, __m256h __D) { - return __builtin_ia32_vmulph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_mulph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_mulph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vmulph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_mulph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } extern __inline __m128h @@ -185,30 +302,30 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_divph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_divph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_divph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vdivph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_divph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } /* Intrinsics v[max,min]ph. 
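   Editorial sketch, not part of the patch; the 128/256-bit forms in this
   header additionally require -mavx512vl.

     __m128h a  = _mm_cvti32_sh (_mm_setzero_ph (), 4);
     __m128h mx = _mm_max_ph (a, _mm_setzero_ph ());
     __m128h mn = _mm_maskz_min_ph (0x0f, a, _mm_setzero_ph ());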
*/ @@ -216,96 +333,96 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_ph (__m128h __A, __m128h __B) { - return __builtin_ia32_vmaxph_v8hf_mask (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1); + return __builtin_ia32_maxph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_max_ph (__m256h __A, __m256h __B) { - return __builtin_ia32_vmaxph_v16hf_mask (__A, __B, - _mm256_setzero_ph (), - (__mmask16) -1); + return __builtin_ia32_maxph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vmaxph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_maxph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_max_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vmaxph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_maxph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vmaxph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_maxph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_max_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vmaxph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_maxph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_ph (__m128h __A, __m128h __B) { - return __builtin_ia32_vminph_v8hf_mask (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1); + return __builtin_ia32_minph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_min_ph (__m256h __A, __m256h __B) { - return __builtin_ia32_vminph_v16hf_mask (__A, __B, - _mm256_setzero_ph (), - (__mmask16) -1); + return __builtin_ia32_minph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vminph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_minph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_min_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vminph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_minph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vminph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_minph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h 
__C) { - return __builtin_ia32_vminph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_minph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } /* vcmpph */ @@ -314,8 +431,8 @@ extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmp_ph_mask (__m128h __A, __m128h __B, const int __C) { - return (__mmask8) __builtin_ia32_vcmpph_v8hf_mask (__A, __B, __C, - (__mmask8) -1); + return (__mmask8) __builtin_ia32_cmpph128_mask (__A, __B, __C, + (__mmask8) -1); } extern __inline __mmask8 @@ -323,15 +440,15 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_cmp_ph_mask (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return (__mmask8) __builtin_ia32_vcmpph_v8hf_mask (__B, __C, __D, __A); + return (__mmask8) __builtin_ia32_cmpph128_mask (__B, __C, __D, __A); } extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmp_ph_mask (__m256h __A, __m256h __B, const int __C) { - return (__mmask16) __builtin_ia32_vcmpph_v16hf_mask (__A, __B, __C, - (__mmask16) -1); + return (__mmask16) __builtin_ia32_cmpph256_mask (__A, __B, __C, + (__mmask16) -1); } extern __inline __mmask16 @@ -339,25 +456,1819 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_cmp_ph_mask (__mmask16 __A, __m256h __B, __m256h __C, const int __D) { - return (__mmask16) __builtin_ia32_vcmpph_v16hf_mask (__B, __C, __D, - __A); + return (__mmask16) __builtin_ia32_cmpph256_mask (__B, __C, __D, + __A); } #else -#define _mm_cmp_ph_mask(A, B, C) \ - (__builtin_ia32_vcmpph_v8hf_mask ((A), (B), (C), (-1))) +#define _mm_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_cmpph128_mask ((A), (B), (C), (-1))) -#define _mm_mask_cmp_ph_mask(A, B, C, D) \ - (__builtin_ia32_vcmpph_v8hf_mask ((B), (C), (D), (A))) +#define _mm_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_cmpph128_mask ((B), (C), (D), (A))) -#define _mm256_cmp_ph_mask(A, B, C) \ - (__builtin_ia32_vcmpph_v16hf_mask ((A), (B), (C), (-1))) +#define _mm256_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_cmpph256_mask ((A), (B), (C), (-1))) -#define _mm256_mask_cmp_ph_mask(A, B, C, D) \ - (__builtin_ia32_vcmpph_v16hf_mask ((B), (C), (D), (A))) +#define _mm256_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_cmpph256_mask ((B), (C), (D), (A))) #endif /* __OPTIMIZE__ */ +/* Intrinsics vsqrtph. 
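   Usage sketch (editorial addition, not in the patch; needs -mavx512fp16
   and -mavx512vl):

     __m128h x  = _mm_cvti32_sh (_mm_setzero_ph (), 16);
     __m128h r  = _mm_sqrt_ph (x);                            // low lane 4.0
     __m256h rz = _mm256_maskz_sqrt_ph (0x00ff, _mm256_setzero_ph ());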
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ph (__m128h __A) +{ + return __builtin_ia32_sqrtph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_ph (__m256h __A) +{ + return __builtin_ia32_sqrtph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_sqrtph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sqrt_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_sqrtph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_sqrtph128_mask (__B, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sqrt_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_sqrtph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vrsqrtph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ph (__m128h __A) +{ + return __builtin_ia32_rsqrtph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt_ph (__m256h __A) +{ + return __builtin_ia32_rsqrtph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_rsqrtph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rsqrt_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_rsqrtph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_rsqrtph128_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rsqrt_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_rsqrtph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vrcpph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ph (__m128h __A) +{ + return __builtin_ia32_rcpph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp_ph (__m256h __A) +{ + return __builtin_ia32_rcpph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_rcpph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_rcpph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_rcpph128_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rcp_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_rcpph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vscalefph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ph (__m128h __A, __m128h __B) +{ + return __builtin_ia32_scalefph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_scalef_ph (__m256h __A, __m256h __B) +{ + return __builtin_ia32_scalefph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_scalefph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_scalef_ph (__m256h __A, __mmask16 __B, __m256h __C, + __m256h __D) +{ + return __builtin_ia32_scalefph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_scalefph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_scalef_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_scalefph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vreduceph. 
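   Editorial note, not part of the patch: vreduce returns the fractional
   remainder left after rounding each lane to a fixed number of fraction
   bits; the imm8 layout is assumed here to match vreduceps (upper four
   bits = fraction-bit count, low bits = rounding control).

     __m128h x = _mm_cvtu32_sh (_mm_setzero_ph (), 5);
     __m128h r = _mm_reduce_ph (x, 0x00);   // x minus x rounded to an integer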
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_ph (__m128h __A, int __B) +{ + return __builtin_ia32_reduceph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_reduceph128_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_ph (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_reduceph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_ph (__m256h __A, int __B) +{ + return __builtin_ia32_reduceph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_ph (__m256h __A, __mmask16 __B, __m256h __C, int __D) +{ + return __builtin_ia32_reduceph256_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_ph (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_reduceph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +#else +#define _mm_reduce_ph(A, B) \ + (__builtin_ia32_reduceph128_mask ((A), (B), \ + _mm_setzero_ph (), \ + ((__mmask8)-1))) + +#define _mm_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph128_mask ((C), (D), (A), (B))) + +#define _mm_maskz_reduce_ph(A, B, C) \ + (__builtin_ia32_reduceph128_mask ((B), (C), _mm_setzero_ph (), (A))) + +#define _mm256_reduce_ph(A, B) \ + (__builtin_ia32_reduceph256_mask ((A), (B), \ + _mm256_setzero_ph (), \ + ((__mmask16)-1))) + +#define _mm256_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph256_mask ((C), (D), (A), (B))) + +#define _mm256_maskz_reduce_ph(A, B, C) \ + (__builtin_ia32_reduceph256_mask ((B), (C), _mm256_setzero_ph (), (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscaleph. 
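   A usage sketch (editorial, not from the patch), assuming the imm8
   follows the usual vrndscale encoding in which the low bits select the
   rounding mode:

     __m128h x = _mm_cvtu32_sh (_mm_setzero_ph (), 3);
     __m128h n = _mm_roundscale_ph (x, _MM_FROUND_TO_NEAREST_INT);
     __m128h f = _mm_roundscale_ph (x, _MM_FROUND_TO_NEG_INF);   // floor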
*/ +#ifdef __OPTIMIZE__ + extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roundscale_ph (__m128h __A, int __B) + { + return __builtin_ia32_rndscaleph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); + } + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_rndscaleph128_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_ph (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_rndscaleph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_ph (__m256h __A, int __B) +{ + return __builtin_ia32_rndscaleph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_ph (__m256h __A, __mmask16 __B, __m256h __C, + int __D) +{ + return __builtin_ia32_rndscaleph256_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_ph (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_rndscaleph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +#else +#define _mm_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph128_mask ((A), (B), _mm_setzero_ph (), \ + ((__mmask8)-1))) + +#define _mm_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph128_mask ((C), (D), (A), (B))) + +#define _mm_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph128_mask ((B), (C), _mm_setzero_ph (), (A))) + +#define _mm256_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph256_mask ((A), (B), \ + _mm256_setzero_ph(), \ + ((__mmask16)-1))) + +#define _mm256_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph256_mask ((C), (D), (A), (B))) + +#define _mm256_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph256_mask ((B), (C), \ + _mm256_setzero_ph (), (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfpclassph. 
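   Editorial sketch, not part of the patch: vfpclass tests every lane
   against the categories selected by the immediate; the category bits are
   assumed here to match the other vfpclass instructions (0x01 QNaN,
   0x08 +Inf, 0x10 -Inf, 0x80 SNaN, ...).

     __m128h  x   = _mm_setzero_ph ();
     __mmask8 nan = _mm_fpclass_ph_mask (x, 0x81);   // QNaN or SNaN lanes
     __mmask8 inf = _mm_fpclass_ph_mask (x, 0x18);   // +Inf or -Inf lanes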
*/ +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fpclass_ph_mask (__mmask8 __U, __m128h __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A, + __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_ph_mask (__m128h __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A, + __imm, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_ph_mask (__mmask16 __U, __m256h __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A, + __imm, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_ph_mask (__m256h __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A, + __imm, + (__mmask16) -1); +} + +#else +#define _mm_fpclass_ph_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \ + (int) (C),(__mmask8)-1)) + +#define _mm_mask_fpclass_ph_mask(u, X, C) \ + ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \ + (int) (C),(__mmask8)(u))) + +#define _mm256_fpclass_ph_mask(X, C) \ + ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \ + (int) (C),(__mmask16)-1)) + +#define _mm256_mask_fpclass_ph_mask(u, X, C) \ + ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \ + (int) (C),(__mmask16)(u))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vgetexpph, vgetexpsh. */ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getexp_ph (__m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getexp_ph (__m256h __W, __mmask16 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) __W, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getexp_ph (__mmask16 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ph (__m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_ph (__m128h __W, __mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_ph (__mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U); +} + + +/* Intrinsics vgetmantph, vgetmantsh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_ph (__m256h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_ph (__m256h __W, __mmask16 __U, __m256h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) __W, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_ph (__mmask16 __U, __m256h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ph (__m128h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ph (__m128h __W, __mmask8 __U, __m128h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ph (__mmask8 __U, __m128h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U); +} + +#else +#define _mm256_getmant_ph(X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)_mm256_setzero_ph (), \ + (__mmask16)-1)) + +#define _mm256_mask_getmant_ph(W, U, X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_getmant_ph(U, X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)_mm256_setzero_ph (), \ + (__mmask16)(U))) + +#define _mm_getmant_ph(X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)_mm_setzero_ph (), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_ph(W, U, X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ph(U, X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)_mm_setzero_ph (), \ + (__mmask8)(U))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2dq. 
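The VGETMANTPH wrappers above take the two immediate enums (_MM_MANTISSA_NORM_ENUM, _MM_MANTISSA_SIGN_ENUM) already used by the float and double getmant intrinsics and fold them into the builtin's immediate as (C << 2) | B. A sketch of the usual pairing with getexp, assuming the _MM_MANT_* constants from the wider AVX-512 headers:

  #include <immintrin.h>
  #include <stdio.h>

  /* Split 12.0 into a mantissa in [1, 2) and an unbiased exponent.
     Compile with: gcc -O2 -mavx512fp16 -mavx512vl getmant.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) 12.0f);

    /* Normalise the mantissa into [1.0, 2.0) and keep the source sign.  */
    __m128h m = _mm_getmant_ph (x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
    __m128h e = _mm_getexp_ph (x);

    /* Expect mantissa 1.5 and exponent 3.0, since 12 = 1.5 * 2^3.  */
    printf ("mant %f  exp %f\n",
            (double) _mm_cvtsh_h (m), (double) _mm_cvtsh_h (e));
    return 0;
  }
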
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__C, ( __v4si) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__B, + (__v4si) _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__C, ( __v8si) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2udq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__C, ( __v4si) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__B, + (__v4si) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__C, ( __v8si) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__B, + (__v8si) _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2dq. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2dq128_mask (__A, + (__v4si) _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i)__builtin_ia32_vcvttph2dq128_mask (__C, + ( __v4si) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2dq128_mask (__B, + (__v4si) _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__C, + ( __v8si) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2udq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__C, + ( __v4si) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__B, + (__v4si) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__C, + ( __v8si) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtdq2ph. 
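Taken together, the vcvtph2dq/vcvtph2udq wrappers and the vcvttph2dq/vcvttph2udq wrappers above differ only in rounding: the plain forms honour the current MXCSR rounding mode, the "tt" forms truncate toward zero. Only the low four half lanes of the __m128h source feed the 128-bit destination. A sketch under the same assumptions as the earlier examples:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl cvt32.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) 2.75f);

    __m128i r = _mm_cvtph_epi32 (x);   /* 3 3 3 3 (round-to-nearest default) */
    __m128i t = _mm_cvttph_epi32 (x);  /* 2 2 2 2 (truncate toward zero) */

    /* Merge-masking: lanes with a clear mask bit keep the value from src.  */
    __m128i src = _mm_set1_epi32 (-1);
    __m128i m = _mm_mask_cvtph_epi32 (src, (__mmask8) 0x1, x); /* 3 -1 -1 -1 */

    printf ("%d %d %d\n", _mm_cvtsi128_si32 (r), _mm_cvtsi128_si32 (t),
            _mm_cvtsi128_si32 (m));
    return 0;
  }
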
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_ph (__m128i __A) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_ph (__m256i __A) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtudq2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_ph (__m128i __A) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __C, + __A, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu32_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu32_ph (__m256i __A) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu32_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2qq. 
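For the opposite direction, the vcvtdq2ph/vcvtudq2ph wrappers above convert signed or unsigned 32-bit lanes to half precision; a 128-bit source fills only the low four half lanes of the __m128h result (the rest are zeroed), while a 256-bit source fills all eight. A sketch:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl dq2ph.c  */
  int
  main (void)
  {
    __m128i i4 = _mm_set_epi32 (40, 30, 20, 10);
    __m128h lo = _mm_cvtepi32_ph (i4);        /* 10 20 30 40 0 0 0 0 */

    __m256i i8 = _mm256_set1_epi32 (7);
    __m128h all = _mm256_cvtepi32_ph (i8);    /* eight lanes of 7.0 */

    /* Unsigned-source variant.  */
    __m128i u4 = _mm_set1_epi32 (1000);
    __m128h uu = _mm_cvtepu32_ph (u4);

    printf ("%f %f %f\n", (double) _mm_cvtsh_h (lo),
            (double) _mm_cvtsh_h (all), (double) _mm_cvtsh_h (uu));
    return 0;
  }
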
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi64 (__m128h __A) +{ + return + __builtin_ia32_vcvtph2qq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq128_mask (__C, __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2qq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq256_mask (__C, __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2uqq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__C, __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__C, __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2qq. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq128_mask (__C, + __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq256_mask (__C, + __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2uqq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__C, + __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__C, + __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtqq2ph. 
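The vcvtph2qq/vcvtph2uqq and vcvttph2qq/vcvttph2uqq wrappers above mirror the 32-bit versions for 64-bit destinations, again in rounding and truncating forms; only the low two (128-bit destination) or four (256-bit destination) half lanes of the source are consumed. A sketch, additionally assuming an x86-64 target for _mm_cvtsi128_si64:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl cvt64.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) 5.9f);

    __m128i q  = _mm_cvtph_epi64 (x);    /* rounded: 6, 6 */
    __m128i tq = _mm_cvttph_epi64 (x);   /* truncated: 5, 5 */
    __m256i q4 = _mm256_cvtph_epi64 (x); /* four 64-bit lanes */

    printf ("%lld %lld %lld\n",
            (long long) _mm_cvtsi128_si64 (q),
            (long long) _mm_cvtsi128_si64 (tq),
            (long long) _mm256_extract_epi64 (q4, 0));
    return 0;
  }
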
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_ph (__m128i __A) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_ph (__m256i __A) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtuqq2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu64_ph (__m128i __A) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu64_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu64_ph (__m256i __A) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu64_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2w. 
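Going back from 64-bit integers, the vcvtqq2ph/vcvtuqq2ph wrappers above return an __m128h for both source widths: two valid half lanes from a 128-bit source, four from a 256-bit source, with the upper lanes zeroed. A sketch:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl qq2ph.c  */
  int
  main (void)
  {
    __m128i q2 = _mm_set_epi64x (200, 100);
    __m256i q4 = _mm256_set1_epi64x (300);

    __m128h a = _mm_cvtepi64_ph (q2);     /* 100.0, 200.0, rest 0 */
    __m128h b = _mm256_cvtepi64_ph (q4);  /* 300.0 x4, upper lanes 0 */

    printf ("%f %f\n", (double) _mm_cvtsh_h (a), (double) _mm_cvtsh_h (b));
    return 0;
  }
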
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__C, ( __v8hi) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__C, ( __v16hi) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2uw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__C, ( __v8hi) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__C, ( __v16hi) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2w. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__C, + ( __v8hi) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__C, + ( __v16hi) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2uw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__C, + ( __v8hi) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__C, + ( __v16hi) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__B, + (__v16hi) _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtw2ph. 
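The 16-bit integer conversions above (vcvtph2w/vcvtph2uw and their truncating vcvttph2w/vcvttph2uw forms) are lane-for-lane: eight lanes in 128 bits, sixteen in 256 bits, so no destination lanes are wasted. A sketch under the same assumptions:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl cvt16.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) -1.5f);

    __m128i w  = _mm_cvtph_epi16 (x);   /* -2 per lane (round-to-nearest-even) */
    __m128i tw = _mm_cvttph_epi16 (x);  /* -1 per lane (truncation) */

    short out[8], tout[8];
    _mm_storeu_si128 ((__m128i *) out, w);
    _mm_storeu_si128 ((__m128i *) tout, tw);

    printf ("%d %d\n", out[0], tout[0]);
    return 0;
  }
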
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_ph (__m128i __A) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __C, + __A, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi16_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_ph (__m256i __A) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __A, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_ph (__m256h __A, __mmask16 __B, __m256i __C) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __C, + __A, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi16_ph (__mmask16 __A, __m256i __B) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __B, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vcvtuw2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_ph (__m128i __A) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu16_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu16_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_ph (__m256i __A) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __A, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu16_ph (__m256h __A, __mmask16 __B, __m256i __C) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu16_ph (__mmask16 __A, __m256i __B) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __B, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2pd. 
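The vcvtw2ph/vcvtuw2ph wrappers above complete the 16-bit pair, converting signed or unsigned words to half precision with the usual merge- and zero-masking variants. A sketch:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl w2ph.c  */
  int
  main (void)
  {
    __m128i w = _mm_set1_epi16 (-42);
    __m128h a = _mm_cvtepi16_ph (w);                    /* -42.0 in every lane */

    /* Zero-masking: convert only lanes 0..3, zero the rest.  */
    __m128h b = _mm_maskz_cvtepi16_ph ((__mmask8) 0x0f, w);

    /* Unsigned 16-bit source.  */
    __m128i u = _mm_set1_epi16 (1000);
    __m128h c = _mm_cvtepu16_ph (u);

    printf ("%f %f %f\n", (double) _mm_cvtsh_h (a),
            (double) _mm_cvtsh_h (b), (double) _mm_cvtsh_h (c));
    return 0;
  }
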
*/ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd128_mask (__A, + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_pd (__m128d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd128_mask (__C, __A, __B); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd128_mask (__B, _mm_setzero_pd (), __A); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd256_mask (__A, + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_pd (__m256d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd256_mask (__C, __A, __B); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd256_mask (__B, + _mm256_setzero_pd (), + __A); +} + +/* Intrinsics vcvtph2ps. */ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtxph_ps (__m128h __A) +{ + return __builtin_ia32_vcvtph2psx128_mask (__A, + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtxph_ps (__m128 __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2psx128_mask (__C, __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtxph_ps (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2psx128_mask (__B, _mm_setzero_ps (), __A); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtxph_ps (__m128h __A) +{ + return __builtin_ia32_vcvtph2psx256_mask (__A, + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtxph_ps (__m256 __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2psx256_mask (__C, __A, __B); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtxph_ps (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2psx256_mask (__B, + _mm256_setzero_ps (), + __A); +} + +/* Intrinsics vcvtxps2ph. 
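vcvtph2pd above widens the low two half lanes to doubles and vcvtph2psx the low four to singles; the extra "x" keeps these AVX512FP16 wrappers distinct from the older F16C _mm_cvtph_ps, which takes an __m128i. The narrowing counterparts (vcvtps2phx, vcvtpd2ph) are added just below and are exercised in this sketch as well:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl ph2pd.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) 0.5f);

    __m128d d = _mm_cvtph_pd (x);    /* { 0.5, 0.5 } */
    __m128  s = _mm_cvtxph_ps (x);   /* { 0.5f, 0.5f, 0.5f, 0.5f } */

    /* Narrowing direction, defined just below in this header.  */
    __m128h back_s = _mm_cvtxps_ph (s);
    __m128h back_d = _mm_cvtpd_ph (d);

    printf ("%f %f %f %f\n", _mm_cvtsd_f64 (d), (double) _mm_cvtss_f32 (s),
            (double) _mm_cvtsh_h (back_s), (double) _mm_cvtsh_h (back_d));
    return 0;
  }
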
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtxps_ph (__m128 __A) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m128 __C) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtxps_ph (__mmask8 __A, __m128 __B) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtxps_ph (__m256 __A) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m256 __C) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtxps_ph (__mmask8 __A, __m256 __B) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtpd2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_ph (__m128d __A) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m128d __C) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_ph (__mmask8 __A, __m128d __B) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_ph (__m256d __A) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m256d __C) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_ph (__mmask8 __A, __m256d __B) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __B, + _mm_setzero_ph (), + __A); +} + #ifdef __DISABLE_AVX512FP16VL__ #undef __DISABLE_AVX512FP16VL__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index d11c02b..7fd4286 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -134,6 +134,7 @@ DEF_POINTER_TYPE (PCVOID, VOID, CONST) DEF_POINTER_TYPE (PVOID, VOID) DEF_POINTER_TYPE (PDOUBLE, DOUBLE) DEF_POINTER_TYPE (PFLOAT, FLOAT) +DEF_POINTER_TYPE (PCFLOAT16, FLOAT16, CONST) DEF_POINTER_TYPE (PSHORT, SHORT) DEF_POINTER_TYPE (PUSHORT, USHORT) DEF_POINTER_TYPE (PINT, INT) @@ -1304,17 +1305,72 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) # FP16 builtins DEF_FUNCTION_TYPE (V8HF, V8HI) 
+DEF_FUNCTION_TYPE (QI, V8HF, INT, UQI) +DEF_FUNCTION_TYPE (HI, V16HF, INT, UHI) +DEF_FUNCTION_TYPE (SI, V32HF, INT, USI) +DEF_FUNCTION_TYPE (INT, V8HF, INT) +DEF_FUNCTION_TYPE (INT64, V8HF, INT) +DEF_FUNCTION_TYPE (UINT, V8HF, INT) +DEF_FUNCTION_TYPE (UINT64, V8HF, INT) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF) +DEF_FUNCTION_TYPE (VOID, PCFLOAT16, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, PCFLOAT16, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8HF, INT, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, INT64, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, UINT, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, UINT64, INT) +DEF_FUNCTION_TYPE (V2DI, V8HF, V2DI, UQI) +DEF_FUNCTION_TYPE (V4DI, V8HF, V4DI, UQI) +DEF_FUNCTION_TYPE (V2DF, V8HF, V2DF, UQI) +DEF_FUNCTION_TYPE (V4DF, V8HF, V4DF, UQI) +DEF_FUNCTION_TYPE (V4SI, V8HF, V4SI, UQI) +DEF_FUNCTION_TYPE (V4SF, V8HF, V4SF, UQI) +DEF_FUNCTION_TYPE (V8SI, V8HF, V8SI, UQI) +DEF_FUNCTION_TYPE (V8SF, V8HF, V8SF, UQI) +DEF_FUNCTION_TYPE (V8HI, V8HF, V8HI, UQI) +DEF_FUNCTION_TYPE (V8HF, V4SI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8SI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8SF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V2DI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V4DI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V4DF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8HI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, UQI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, INT, V8HF, UQI) DEF_FUNCTION_TYPE (UQI, V8HF, V8HF, INT, UQI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI) DEF_FUNCTION_TYPE (UQI, V8HF, V8HF, INT, UQI, INT) +DEF_FUNCTION_TYPE (V8DI, V8HF, V8DI, UQI, INT) +DEF_FUNCTION_TYPE (V8DF, V8HF, V8DF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V8DI, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V8DF, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V2DF, V8HF, V2DF, V2DF, UQI, INT) +DEF_FUNCTION_TYPE (V4SF, V8HF, V4SF, V4SF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF) +DEF_FUNCTION_TYPE (V16HI, V16HF, V16HI, UHI) +DEF_FUNCTION_TYPE (V16HF, V16HI, V16HF, UHI) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, UHI) +DEF_FUNCTION_TYPE (V16SI, V16HF, V16SI, UHI, INT) +DEF_FUNCTION_TYPE (V16SF, V16HF, V16SF, UHI, INT) +DEF_FUNCTION_TYPE (V16HF, V16HF, INT, V16HF, UHI) DEF_FUNCTION_TYPE (UHI, V16HF, V16HF, INT, UHI) +DEF_FUNCTION_TYPE (V16HF, V16SI, V16HF, UHI, INT) +DEF_FUNCTION_TYPE (V16HF, V16SF, V16HF, UHI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) +DEF_FUNCTION_TYPE (V32HI, V32HF, V32HI, USI, INT) +DEF_FUNCTION_TYPE (V32HF, V32HI, V32HF, USI, INT) DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) +DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index c9d80cb..dc56dc2 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -393,6 +393,10 @@ BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mas BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) +/* AVX512FP16 */ +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, "__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) V8HF_FTYPE_PCFLOAT16_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_storehf_mask, "__builtin_ia32_storesh_mask", IX86_BUILTIN_STORESH_MASK, UNKNOWN, (int) VOID_FTYPE_PCFLOAT16_V8HF_UQI) + /* RDPKRU and WRPKRU. */ BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru, "__builtin_ia32_rdpkru", IX86_BUILTIN_RDPKRU, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru, "__builtin_ia32_wrpkru", IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED) @@ -2775,33 +2779,102 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__b BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) /* AVX512FP16. */ -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, 
OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask, "__builtin_ia32_vaddsh_v8hf_mask", IX86_BUILTIN_VADDSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask, "__builtin_ia32_vsubsh_v8hf_mask", IX86_BUILTIN_VSUBSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask, "__builtin_ia32_vmulsh_v8hf_mask", IX86_BUILTIN_VMULSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask, "__builtin_ia32_vdivsh_v8hf_mask", IX86_BUILTIN_VDIVSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv8hf3_mask, "__builtin_ia32_vmaxph_v8hf_mask", IX86_BUILTIN_VMAXPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv16hf3_mask, "__builtin_ia32_vmaxph_v16hf_mask", IX86_BUILTIN_VMAXPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask, "__builtin_ia32_vmaxph_v32hf_mask", IX86_BUILTIN_VMAXPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv8hf3_mask, "__builtin_ia32_vminph_v8hf_mask", IX86_BUILTIN_VMINPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv16hf3_mask, "__builtin_ia32_vminph_v16hf_mask", IX86_BUILTIN_VMINPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask, "__builtin_ia32_vminph_v32hf_mask", IX86_BUILTIN_VMINPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask, "__builtin_ia32_vmaxsh_v8hf_mask", IX86_BUILTIN_VMAXSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask, "__builtin_ia32_vminsh_v8hf_mask", IX86_BUILTIN_VMINSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_cmpv8hf3_mask, "__builtin_ia32_vcmpph_v8hf_mask", IX86_BUILTIN_VCMPPH_V8HF_MASK, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_cmpv16hf3_mask, "__builtin_ia32_vcmpph_v16hf_mask", IX86_BUILTIN_VCMPPH_V16HF_MASK, UNKNOWN, (int) UHI_FTYPE_V16HF_V16HF_INT_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask, "__builtin_ia32_vcmpph_v32hf_mask", IX86_BUILTIN_VCMPPH_V32HF_MASK, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_addph256_mask", IX86_BUILTIN_ADDPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_addv32hf3_mask, "__builtin_ia32_addph512_mask", IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_subph128_mask", IX86_BUILTIN_SUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_subph256_mask", IX86_BUILTIN_SUBPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_subph512_mask", IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_mulph128_mask", IX86_BUILTIN_MULPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_mulph256_mask", IX86_BUILTIN_MULPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_mulph512_mask", IX86_BUILTIN_MULPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_divph128_mask", IX86_BUILTIN_DIVPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_divph256_mask", IX86_BUILTIN_DIVPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_divph512_mask", IX86_BUILTIN_DIVPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask, "__builtin_ia32_addsh_mask", IX86_BUILTIN_ADDSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask, "__builtin_ia32_subsh_mask", IX86_BUILTIN_SUBSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask, "__builtin_ia32_mulsh_mask", IX86_BUILTIN_MULSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask, "__builtin_ia32_divsh_mask", IX86_BUILTIN_DIVSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv8hf3_mask, "__builtin_ia32_maxph128_mask", IX86_BUILTIN_MAXPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv16hf3_mask, "__builtin_ia32_maxph256_mask", IX86_BUILTIN_MAXPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask, "__builtin_ia32_maxph512_mask", IX86_BUILTIN_MAXPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv8hf3_mask, "__builtin_ia32_minph128_mask", IX86_BUILTIN_MINPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv16hf3_mask, "__builtin_ia32_minph256_mask", IX86_BUILTIN_MINPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, 
OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask, "__builtin_ia32_minph512_mask", IX86_BUILTIN_MINPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask, "__builtin_ia32_maxsh_mask", IX86_BUILTIN_MAXSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask, "__builtin_ia32_minsh_mask", IX86_BUILTIN_MINSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_cmpv8hf3_mask, "__builtin_ia32_cmpph128_mask", IX86_BUILTIN_CMPPH128_MASK, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_cmpv16hf3_mask, "__builtin_ia32_cmpph256_mask", IX86_BUILTIN_CMPPH256_MASK, UNKNOWN, (int) UHI_FTYPE_V16HF_V16HF_INT_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask, "__builtin_ia32_cmpph512_mask", IX86_BUILTIN_CMPPH512_MASK, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv8hf2_mask, "__builtin_ia32_sqrtph128_mask", IX86_BUILTIN_SQRTPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv16hf2_mask, "__builtin_ia32_sqrtph256_mask", IX86_BUILTIN_SQRTPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv8hf2_mask, "__builtin_ia32_rsqrtph128_mask", IX86_BUILTIN_RSQRTPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv16hf2_mask, "__builtin_ia32_rsqrtph256_mask", IX86_BUILTIN_RSQRTPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv32hf2_mask, "__builtin_ia32_rsqrtph512_mask", IX86_BUILTIN_RSQRTPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmrsqrtv8hf2_mask, "__builtin_ia32_rsqrtsh_mask", IX86_BUILTIN_RSQRTSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv8hf2_mask, "__builtin_ia32_rcpph128_mask", IX86_BUILTIN_RCPPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv16hf2_mask, "__builtin_ia32_rcpph256_mask", IX86_BUILTIN_RCPPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv32hf2_mask, "__builtin_ia32_rcpph512_mask", IX86_BUILTIN_RCPPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmrcpv8hf2_mask, "__builtin_ia32_rcpsh_mask", IX86_BUILTIN_RCPSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_scalefv8hf_mask, "__builtin_ia32_scalefph128_mask", IX86_BUILTIN_SCALEFPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_scalefv16hf_mask, "__builtin_ia32_scalefph256_mask", IX86_BUILTIN_SCALEFPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_reducepv8hf_mask, "__builtin_ia32_reduceph128_mask", IX86_BUILTIN_REDUCEPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducepv16hf_mask, "__builtin_ia32_reduceph256_mask", IX86_BUILTIN_REDUCEPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf_mask, "__builtin_ia32_rndscaleph128_mask", IX86_BUILTIN_RNDSCALEPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf_mask, "__builtin_ia32_rndscaleph256_mask", IX86_BUILTIN_RNDSCALEPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv16hf_mask, "__builtin_ia32_fpclassph256_mask", IX86_BUILTIN_FPCLASSPH256, UNKNOWN, (int) HI_FTYPE_V16HF_INT_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv8hf_mask, "__builtin_ia32_fpclassph128_mask", IX86_BUILTIN_FPCLASSPH128, UNKNOWN, (int) QI_FTYPE_V8HF_INT_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv32hf_mask, "__builtin_ia32_fpclassph512_mask", IX86_BUILTIN_FPCLASSPH512, UNKNOWN, (int) SI_FTYPE_V32HF_INT_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_vmfpclassv8hf_mask, "__builtin_ia32_fpclasssh_mask", IX86_BUILTIN_FPCLASSSH_MASK, UNKNOWN, (int) QI_FTYPE_V8HF_INT_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_getexpv16hf_mask, "__builtin_ia32_getexpph256_mask", IX86_BUILTIN_GETEXPPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_getexpv8hf_mask, "__builtin_ia32_getexpph128_mask", IX86_BUILTIN_GETEXPPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_getmantv16hf_mask, "__builtin_ia32_getmantph256_mask", IX86_BUILTIN_GETMANTPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_getmantv8hf_mask, "__builtin_ia32_getmantph128_mask", IX86_BUILTIN_GETMANTPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_movhf_mask, "__builtin_ia32_vmovsh_mask", IX86_BUILTIN_VMOVSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v4si_mask, "__builtin_ia32_vcvtph2dq128_mask", IX86_BUILTIN_VCVTPH2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v8si_mask, "__builtin_ia32_vcvtph2dq256_mask", IX86_BUILTIN_VCVTPH2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v4si_mask, "__builtin_ia32_vcvtph2udq128_mask", IX86_BUILTIN_VCVTPH2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v8si_mask, "__builtin_ia32_vcvtph2udq256_mask", IX86_BUILTIN_VCVTPH2UDQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv4si2_mask, "__builtin_ia32_vcvttph2dq128_mask", 
IX86_BUILTIN_VCVTTPH2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8si2_mask, "__builtin_ia32_vcvttph2dq256_mask", IX86_BUILTIN_VCVTTPH2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv4si2_mask, "__builtin_ia32_vcvttph2udq128_mask", IX86_BUILTIN_VCVTTPH2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8si2_mask, "__builtin_ia32_vcvttph2udq256_mask", IX86_BUILTIN_VCVTTPH2UDQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v2di_mask, "__builtin_ia32_vcvtph2qq128_mask", IX86_BUILTIN_VCVTPH2QQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v4di_mask, "__builtin_ia32_vcvtph2qq256_mask", IX86_BUILTIN_VCVTPH2QQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v2di_mask, "__builtin_ia32_vcvtph2uqq128_mask", IX86_BUILTIN_VCVTPH2UQQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v4di_mask, "__builtin_ia32_vcvtph2uqq256_mask", IX86_BUILTIN_VCVTPH2UQQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv2di2_mask, "__builtin_ia32_vcvttph2qq128_mask", IX86_BUILTIN_VCVTTPH2QQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv4di2_mask, "__builtin_ia32_vcvttph2qq256_mask", IX86_BUILTIN_VCVTTPH2QQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv2di2_mask, "__builtin_ia32_vcvttph2uqq128_mask", IX86_BUILTIN_VCVTTPH2UQQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv4di2_mask, "__builtin_ia32_vcvttph2uqq256_mask", IX86_BUILTIN_VCVTTPH2UQQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v8hi_mask, "__builtin_ia32_vcvtph2w128_mask", IX86_BUILTIN_VCVTPH2W128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v16hi_mask, "__builtin_ia32_vcvtph2w256_mask", IX86_BUILTIN_VCVTPH2W256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v8hi_mask, "__builtin_ia32_vcvtph2uw128_mask", IX86_BUILTIN_VCVTPH2UW128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v16hi_mask, "__builtin_ia32_vcvtph2uw256_mask", IX86_BUILTIN_VCVTPH2UW256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8hi2_mask, "__builtin_ia32_vcvttph2w128_mask", IX86_BUILTIN_VCVTTPH2W128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI) +BDESC 
(OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv16hi2_mask, "__builtin_ia32_vcvttph2w256_mask", IX86_BUILTIN_VCVTTPH2W256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8hi2_mask, "__builtin_ia32_vcvttph2uw128_mask", IX86_BUILTIN_VCVTTPH2UW128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv16hi2_mask, "__builtin_ia32_vcvttph2uw256_mask", IX86_BUILTIN_VCVTTPH2UW256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v8hi_mask, "__builtin_ia32_vcvtw2ph128_mask", IX86_BUILTIN_VCVTW2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v16hi_mask, "__builtin_ia32_vcvtw2ph256_mask", IX86_BUILTIN_VCVTW2PH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HI_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v8hi_mask, "__builtin_ia32_vcvtuw2ph128_mask", IX86_BUILTIN_VCVTUW2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v16hi_mask, "__builtin_ia32_vcvtuw2ph256_mask", IX86_BUILTIN_VCVTUW2PH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HI_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v4si_mask, "__builtin_ia32_vcvtdq2ph128_mask", IX86_BUILTIN_VCVTDQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v8si_mask, "__builtin_ia32_vcvtdq2ph256_mask", IX86_BUILTIN_VCVTDQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v4si_mask, "__builtin_ia32_vcvtudq2ph128_mask", IX86_BUILTIN_VCVTUDQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v8si_mask, "__builtin_ia32_vcvtudq2ph256_mask", IX86_BUILTIN_VCVTUDQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v2di_mask, "__builtin_ia32_vcvtqq2ph128_mask", IX86_BUILTIN_VCVTQQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v4di_mask, "__builtin_ia32_vcvtqq2ph256_mask", IX86_BUILTIN_VCVTQQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v2di_mask, "__builtin_ia32_vcvtuqq2ph128_mask", IX86_BUILTIN_VCVTUQQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v4di_mask, "__builtin_ia32_vcvtuqq2ph256_mask", IX86_BUILTIN_VCVTUQQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv2df2_mask, "__builtin_ia32_vcvtph2pd128_mask", IX86_BUILTIN_VCVTPH2PD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V8HF_V2DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv4df2_mask, 
"__builtin_ia32_vcvtph2pd256_mask", IX86_BUILTIN_VCVTPH2PD256_MASK, UNKNOWN, (int) V4DF_FTYPE_V8HF_V4DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv4sf2_mask, "__builtin_ia32_vcvtph2psx128_mask", IX86_BUILTIN_VCVTPH2PSX128_MASK, UNKNOWN, (int) V4SF_FTYPE_V8HF_V4SF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv8sf2_mask, "__builtin_ia32_vcvtph2psx256_mask", IX86_BUILTIN_VCVTPH2PSX256_MASK, UNKNOWN, (int) V8SF_FTYPE_V8HF_V8SF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v4sf_mask, "__builtin_ia32_vcvtps2phx128_mask", IX86_BUILTIN_VCVTPS2PHX128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v8sf_mask, "__builtin_ia32_vcvtps2phx256_mask", IX86_BUILTIN_VCVTPS2PHX256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v2df_mask, "__builtin_ia32_vcvtpd2ph128_mask", IX86_BUILTIN_VCVTPD2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v4df_mask, "__builtin_ia32_vcvtpd2ph256_mask", IX86_BUILTIN_VCVTPD2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DF_V8HF_UQI) /* Builtins with rounding support. */ BDESC_END (ARGS, ROUND_ARGS) @@ -3003,20 +3076,70 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "_ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT) /* AVX512FP16. 
*/ -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask_round, "__builtin_ia32_vaddsh_v8hf_mask_round", IX86_BUILTIN_VADDSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask_round, "__builtin_ia32_vsubsh_v8hf_mask_round", IX86_BUILTIN_VSUBSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask_round, "__builtin_ia32_vmulsh_v8hf_mask_round", IX86_BUILTIN_VMULSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask_round, "__builtin_ia32_vdivsh_v8hf_mask_round", IX86_BUILTIN_VDIVSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask_round, "__builtin_ia32_vmaxph_v32hf_mask_round", IX86_BUILTIN_VMAXPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask_round, "__builtin_ia32_vminph_v32hf_mask_round", IX86_BUILTIN_VMINPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask_round, "__builtin_ia32_vmaxsh_v8hf_mask_round", IX86_BUILTIN_VMAXSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask_round, "__builtin_ia32_vminsh_v8hf_mask_round", IX86_BUILTIN_VMINSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask_round, "__builtin_ia32_vcmpph_v32hf_mask_round", IX86_BUILTIN_VCMPPH_V32HF_MASK_ROUND, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmcmpv8hf3_mask_round, "__builtin_ia32_vcmpsh_v8hf_mask_round", IX86_BUILTIN_VCMPSH_V8HF_MASK_ROUND, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_addph512_mask_round", IX86_BUILTIN_ADDPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_subph512_mask_round", IX86_BUILTIN_SUBPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_mulph512_mask_round", IX86_BUILTIN_MULPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, 
OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_divph512_mask_round", IX86_BUILTIN_DIVPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask_round, "__builtin_ia32_addsh_mask_round", IX86_BUILTIN_ADDSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask_round, "__builtin_ia32_subsh_mask_round", IX86_BUILTIN_SUBSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask_round, "__builtin_ia32_mulsh_mask_round", IX86_BUILTIN_MULSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask_round, "__builtin_ia32_divsh_mask_round", IX86_BUILTIN_DIVSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask_round, "__builtin_ia32_maxph512_mask_round", IX86_BUILTIN_MAXPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask_round, "__builtin_ia32_minph512_mask_round", IX86_BUILTIN_MINPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask_round, "__builtin_ia32_maxsh_mask_round", IX86_BUILTIN_MAXSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask_round, "__builtin_ia32_minsh_mask_round", IX86_BUILTIN_MINSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask_round, "__builtin_ia32_cmpph512_mask_round", IX86_BUILTIN_CMPPH512_MASK_ROUND, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmcmpv8hf3_mask_round, "__builtin_ia32_cmpsh_mask_round", IX86_BUILTIN_CMPSH_MASK_ROUND, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv32hf2_mask_round, "__builtin_ia32_sqrtph512_mask_round", IX86_BUILTIN_SQRTPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsqrtv8hf2_mask_round, "__builtin_ia32_sqrtsh_mask_round", IX86_BUILTIN_SQRTSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_scalefv32hf_mask_round, "__builtin_ia32_scalefph512_mask_round", IX86_BUILTIN_SCALEFPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmscalefv8hf_mask_round, "__builtin_ia32_scalefsh_mask_round", IX86_BUILTIN_SCALEFSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducepv32hf_mask_round, "__builtin_ia32_reduceph512_mask_round", IX86_BUILTIN_REDUCEPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducesv8hf_mask_round, "__builtin_ia32_reducesh_mask_round", IX86_BUILTIN_REDUCESH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf_mask_round, "__builtin_ia32_rndscaleph512_mask_round", 
IX86_BUILTIN_RNDSCALEPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_rndscalev8hf_mask_round, "__builtin_ia32_rndscalesh_mask_round", IX86_BUILTIN_RNDSCALESH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_getexpv32hf_mask_round, "__builtin_ia32_getexpph512_mask", IX86_BUILTIN_GETEXPPH512, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_sgetexpv8hf_mask_round, "__builtin_ia32_getexpsh_mask_round", IX86_BUILTIN_GETEXPSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_getmantv32hf_mask_round, "__builtin_ia32_getmantph512_mask", IX86_BUILTIN_GETMANTPH512, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vgetmantv8hf_mask_round, "__builtin_ia32_getmantsh_mask_round", IX86_BUILTIN_GETMANTSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v16si_mask_round, "__builtin_ia32_vcvtph2dq512_mask_round", IX86_BUILTIN_VCVTPH2DQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v16si_mask_round, "__builtin_ia32_vcvtph2udq512_mask_round", IX86_BUILTIN_VCVTPH2UDQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv16si2_mask_round, "__builtin_ia32_vcvttph2dq512_mask_round", IX86_BUILTIN_VCVTTPH2DQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv16si2_mask_round, "__builtin_ia32_vcvttph2udq512_mask_round", IX86_BUILTIN_VCVTTPH2UDQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v8di_mask_round, "__builtin_ia32_vcvtph2qq512_mask_round", IX86_BUILTIN_VCVTPH2QQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v8di_mask_round, "__builtin_ia32_vcvtph2uqq512_mask_round", IX86_BUILTIN_VCVTPH2UQQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8di2_mask_round, "__builtin_ia32_vcvttph2qq512_mask_round", IX86_BUILTIN_VCVTTPH2QQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8di2_mask_round, "__builtin_ia32_vcvttph2uqq512_mask_round", IX86_BUILTIN_VCVTTPH2UQQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v32hi_mask_round, "__builtin_ia32_vcvtph2w512_mask_round", IX86_BUILTIN_VCVTPH2W512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v32hi_mask_round, "__builtin_ia32_vcvtph2uw512_mask_round", IX86_BUILTIN_VCVTPH2UW512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv32hi2_mask_round, "__builtin_ia32_vcvttph2w512_mask_round", IX86_BUILTIN_VCVTTPH2W512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_avx512fp16_fixuns_truncv32hi2_mask_round, "__builtin_ia32_vcvttph2uw512_mask_round", IX86_BUILTIN_VCVTTPH2UW512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v32hi_mask_round, "__builtin_ia32_vcvtw2ph512_mask_round", IX86_BUILTIN_VCVTW2PH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HI_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v32hi_mask_round, "__builtin_ia32_vcvtuw2ph512_mask_round", IX86_BUILTIN_VCVTUW2PH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HI_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v16si_mask_round, "__builtin_ia32_vcvtdq2ph512_mask_round", IX86_BUILTIN_VCVTDQ2PH512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SI_V16HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v16si_mask_round, "__builtin_ia32_vcvtudq2ph512_mask_round", IX86_BUILTIN_VCVTUDQ2PH512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SI_V16HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v8di_mask_round, "__builtin_ia32_vcvtqq2ph512_mask_round", IX86_BUILTIN_VCVTQQ2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DI_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v8di_mask_round, "__builtin_ia32_vcvtuqq2ph512_mask_round", IX86_BUILTIN_VCVTUQQ2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DI_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2si_round, "__builtin_ia32_vcvtsh2si32_round", IX86_BUILTIN_VCVTSH2SI32_ROUND, UNKNOWN, (int) INT_FTYPE_V8HF_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2siq_round, "__builtin_ia32_vcvtsh2si64_round", IX86_BUILTIN_VCVTSH2SI64_ROUND, UNKNOWN, (int) INT64_FTYPE_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2usi_round, "__builtin_ia32_vcvtsh2usi32_round", IX86_BUILTIN_VCVTSH2USI32_ROUND, UNKNOWN, (int) UINT_FTYPE_V8HF_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2usiq_round, "__builtin_ia32_vcvtsh2usi64_round", IX86_BUILTIN_VCVTSH2USI64_ROUND, UNKNOWN, (int) UINT64_FTYPE_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncsi2_round, "__builtin_ia32_vcvttsh2si32_round", IX86_BUILTIN_VCVTTSH2SI32_ROUND, UNKNOWN, (int) INT_FTYPE_V8HF_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncdi2_round, "__builtin_ia32_vcvttsh2si64_round", IX86_BUILTIN_VCVTTSH2SI64_ROUND, UNKNOWN, (int) INT64_FTYPE_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncsi2_round, "__builtin_ia32_vcvttsh2usi32_round", IX86_BUILTIN_VCVTTSH2USI32_ROUND, UNKNOWN, (int) UINT_FTYPE_V8HF_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncdi2_round, "__builtin_ia32_vcvttsh2usi64_round", IX86_BUILTIN_VCVTTSH2USI64_ROUND, UNKNOWN, (int) UINT64_FTYPE_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsi2sh_round, "__builtin_ia32_vcvtsi2sh32_round", IX86_BUILTIN_VCVTSI2SH32_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsi2shq_round, "__builtin_ia32_vcvtsi2sh64_round", IX86_BUILTIN_VCVTSI2SH64_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT64_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtusi2sh_round, "__builtin_ia32_vcvtusi2sh32_round", 
IX86_BUILTIN_VCVTUSI2SH32_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_UINT_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtusi2shq_round, "__builtin_ia32_vcvtusi2sh64_round", IX86_BUILTIN_VCVTUSI2SH64_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_UINT64_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv8df2_mask_round, "__builtin_ia32_vcvtph2pd512_mask_round", IX86_BUILTIN_VCVTPH2PD512_MASK_ROUND, UNKNOWN, (int) V8DF_FTYPE_V8HF_V8DF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv16sf2_mask_round, "__builtin_ia32_vcvtph2psx512_mask_round", IX86_BUILTIN_VCVTPH2PSX512_MASK_ROUND, UNKNOWN, (int) V16SF_FTYPE_V16HF_V16SF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v8df_mask_round, "__builtin_ia32_vcvtpd2ph512_mask_round", IX86_BUILTIN_VCVTPD2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v16sf_mask_round, "__builtin_ia32_vcvtps2phx512_mask_round", IX86_BUILTIN_VCVTPS2PHX512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SF_V16HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2ss_mask_round, "__builtin_ia32_vcvtsh2ss_mask_round", IX86_BUILTIN_VCVTSH2SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2sd_mask_round, "__builtin_ia32_vcvtsh2sd_mask_round", IX86_BUILTIN_VCVTSH2SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtss2sh_mask_round, "__builtin_ia32_vcvtss2sh_mask_round", IX86_BUILTIN_VCVTSS2SH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsd2sh_mask_round, "__builtin_ia32_vcvtsd2sh_mask_round", IX86_BUILTIN_VCVTSD2SH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT) BDESC_END (ROUND_ARGS, MULTI_ARG) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index e117afb..bfafd15 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -9710,6 +9710,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_V16SI_V16HI_UHI: case V16QI_FTYPE_V16SI_V16QI_UHI: case V16QI_FTYPE_V8DI_V16QI_UQI: + case V32HF_FTYPE_V32HF_V32HF_USI: case V16SF_FTYPE_V16SF_V16SF_UHI: case V16SF_FTYPE_V4SF_V16SF_UHI: case V16SI_FTYPE_SI_V16SI_UHI: @@ -9739,20 +9740,40 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_HI_V16HI_UHI: case V8HI_FTYPE_V8HI_V8HI_UQI: case V8HI_FTYPE_HI_V8HI_UQI: + case V16HF_FTYPE_V16HF_V16HF_UHI: case V8SF_FTYPE_V8HI_V8SF_UQI: case V4SF_FTYPE_V8HI_V4SF_UQI: + case V8SI_FTYPE_V8HF_V8SI_UQI: + case V8SF_FTYPE_V8HF_V8SF_UQI: case V8SI_FTYPE_V8SF_V8SI_UQI: case V4SI_FTYPE_V4SF_V4SI_UQI: + case V4SI_FTYPE_V8HF_V4SI_UQI: + case V4SF_FTYPE_V8HF_V4SF_UQI: + case V4DI_FTYPE_V8HF_V4DI_UQI: case V4DI_FTYPE_V4SF_V4DI_UQI: + case V2DI_FTYPE_V8HF_V2DI_UQI: case V2DI_FTYPE_V4SF_V2DI_UQI: + case V8HF_FTYPE_V8HF_V8HF_UQI: + case V8HF_FTYPE_V8HI_V8HF_UQI: + case V8HF_FTYPE_V8SI_V8HF_UQI: + case V8HF_FTYPE_V8SF_V8HF_UQI: + case V8HF_FTYPE_V4SI_V8HF_UQI: + case V8HF_FTYPE_V4SF_V8HF_UQI: + case V8HF_FTYPE_V4DI_V8HF_UQI: + case V8HF_FTYPE_V4DF_V8HF_UQI: + case V8HF_FTYPE_V2DI_V8HF_UQI: + case V8HF_FTYPE_V2DF_V8HF_UQI: case V4SF_FTYPE_V4DI_V4SF_UQI: case V4SF_FTYPE_V2DI_V4SF_UQI: case V4DF_FTYPE_V4DI_V4DF_UQI: + case 
V4DF_FTYPE_V8HF_V4DF_UQI: + case V2DF_FTYPE_V8HF_V2DF_UQI: case V2DF_FTYPE_V2DI_V2DF_UQI: case V16QI_FTYPE_V8HI_V16QI_UQI: case V16QI_FTYPE_V16HI_V16QI_UHI: case V16QI_FTYPE_V4SI_V16QI_UQI: case V16QI_FTYPE_V8SI_V16QI_UQI: + case V8HI_FTYPE_V8HF_V8HI_UQI: case V8HI_FTYPE_V4SI_V8HI_UQI: case V8HI_FTYPE_V8SI_V8HI_UQI: case V16QI_FTYPE_V2DI_V16QI_UQI: @@ -9810,6 +9831,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8DI_FTYPE_DI_V8DI_UQI: case V16SF_FTYPE_V8SF_V16SF_UHI: case V16SI_FTYPE_V8SI_V16SI_UHI: + case V16HF_FTYPE_V16HI_V16HF_UHI: + case V16HI_FTYPE_V16HF_V16HI_UHI: case V16HI_FTYPE_V16HI_V16HI_UHI: case V8HI_FTYPE_V16QI_V8HI_UQI: case V16HI_FTYPE_V16QI_V16HI_UHI: @@ -9910,6 +9933,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case HI_FTYPE_V16SF_INT_UHI: case QI_FTYPE_V8SF_INT_UQI: case QI_FTYPE_V4SF_INT_UQI: + case QI_FTYPE_V8HF_INT_UQI: + case HI_FTYPE_V16HF_INT_UHI: + case SI_FTYPE_V32HF_INT_USI: case V4SI_FTYPE_V4SI_V4SI_UHI: case V8SI_FTYPE_V8SI_V8SI_UHI: nargs = 3; @@ -10058,6 +10084,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16SF_FTYPE_V16SF_INT_V16SF_UHI: case V16HI_FTYPE_V16SF_INT_V16HI_UHI: case V16SI_FTYPE_V16SI_INT_V16SI_UHI: + case V16HF_FTYPE_V16HF_INT_V16HF_UHI: + case V8HF_FTYPE_V8HF_INT_V8HF_UQI: case V4SI_FTYPE_V16SI_INT_V4SI_UQI: case V4DI_FTYPE_V8DI_INT_V4DI_UQI: case V4DF_FTYPE_V8DF_INT_V4DF_UQI: @@ -10229,8 +10257,10 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_avx_vpermilv4df_mask: case CODE_FOR_avx512f_getmantv8df_mask: case CODE_FOR_avx512f_getmantv16sf_mask: + case CODE_FOR_avx512vl_getmantv16hf_mask: case CODE_FOR_avx512vl_getmantv8sf_mask: case CODE_FOR_avx512vl_getmantv4df_mask: + case CODE_FOR_avx512fp16_getmantv8hf_mask: case CODE_FOR_avx512vl_getmantv4sf_mask: case CODE_FOR_avx512vl_getmantv2df_mask: case CODE_FOR_avx512dq_rangepv8df_mask_round: @@ -10645,16 +10675,24 @@ ix86_expand_round_builtin (const struct builtin_description *d, { case UINT64_FTYPE_V2DF_INT: case UINT64_FTYPE_V4SF_INT: + case UINT64_FTYPE_V8HF_INT: case UINT_FTYPE_V2DF_INT: case UINT_FTYPE_V4SF_INT: + case UINT_FTYPE_V8HF_INT: case INT64_FTYPE_V2DF_INT: case INT64_FTYPE_V4SF_INT: + case INT64_FTYPE_V8HF_INT: case INT_FTYPE_V2DF_INT: case INT_FTYPE_V4SF_INT: + case INT_FTYPE_V8HF_INT: nargs = 2; break; case V32HF_FTYPE_V32HF_V32HF_INT: case V8HF_FTYPE_V8HF_V8HF_INT: + case V8HF_FTYPE_V8HF_INT_INT: + case V8HF_FTYPE_V8HF_UINT_INT: + case V8HF_FTYPE_V8HF_INT64_INT: + case V8HF_FTYPE_V8HF_UINT64_INT: case V4SF_FTYPE_V4SF_UINT_INT: case V4SF_FTYPE_V4SF_UINT64_INT: case V2DF_FTYPE_V2DF_UINT64_INT: @@ -10669,18 +10707,29 @@ ix86_expand_round_builtin (const struct builtin_description *d, break; case V8SF_FTYPE_V8DF_V8SF_QI_INT: case V8DF_FTYPE_V8DF_V8DF_QI_INT: + case V32HI_FTYPE_V32HF_V32HI_USI_INT: case V8SI_FTYPE_V8DF_V8SI_QI_INT: + case V8DI_FTYPE_V8HF_V8DI_UQI_INT: case V8DI_FTYPE_V8DF_V8DI_QI_INT: case V8SF_FTYPE_V8DI_V8SF_QI_INT: case V8DF_FTYPE_V8DI_V8DF_QI_INT: + case V8DF_FTYPE_V8HF_V8DF_UQI_INT: + case V16SF_FTYPE_V16HF_V16SF_UHI_INT: + case V32HF_FTYPE_V32HI_V32HF_USI_INT: + case V32HF_FTYPE_V32HF_V32HF_USI_INT: case V16SF_FTYPE_V16SF_V16SF_HI_INT: case V8DI_FTYPE_V8SF_V8DI_QI_INT: case V16SF_FTYPE_V16SI_V16SF_HI_INT: case V16SI_FTYPE_V16SF_V16SI_HI_INT: + case V16SI_FTYPE_V16HF_V16SI_UHI_INT: + case V16HF_FTYPE_V16SI_V16HF_UHI_INT: case V8DF_FTYPE_V8SF_V8DF_QI_INT: case V16SF_FTYPE_V16HI_V16SF_HI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: case 
V4SF_FTYPE_V4SF_V4SF_V4SF_INT: + case V8HF_FTYPE_V8DI_V8HF_UQI_INT: + case V8HF_FTYPE_V8DF_V8HF_UQI_INT: + case V16HF_FTYPE_V16SF_V16HF_UHI_INT: nargs = 4; break; case V4SF_FTYPE_V4SF_V4SF_INT_INT: @@ -10694,8 +10743,10 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: + case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT: case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT: + case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT: @@ -10703,8 +10754,11 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT: case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT: + case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT: + case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT: nargs = 5; break; + case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT: case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT: @@ -10727,6 +10781,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: + case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT: nargs = 6; nargs_constant = 4; break; @@ -10763,10 +10818,12 @@ ix86_expand_round_builtin (const struct builtin_description *d, { case CODE_FOR_avx512f_getmantv8df_mask_round: case CODE_FOR_avx512f_getmantv16sf_mask_round: + case CODE_FOR_avx512bw_getmantv32hf_mask_round: case CODE_FOR_avx512f_vgetmantv2df_round: case CODE_FOR_avx512f_vgetmantv2df_mask_round: case CODE_FOR_avx512f_vgetmantv4sf_round: case CODE_FOR_avx512f_vgetmantv4sf_mask_round: + case CODE_FOR_avx512f_vgetmantv8hf_mask_round: error ("the immediate argument must be a 4-bit immediate"); return const0_rtx; case CODE_FOR_avx512f_cmpv8df3_mask_round: @@ -11070,6 +11127,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case VOID_FTYPE_PFLOAT_V16SF_UHI: case VOID_FTYPE_PFLOAT_V8SF_UQI: case VOID_FTYPE_PFLOAT_V4SF_UQI: + case VOID_FTYPE_PCFLOAT16_V8HF_UQI: case VOID_FTYPE_PV32QI_V32HI_USI: case VOID_FTYPE_PV16QI_V16HI_UHI: case VOID_FTYPE_PUDI_V8HI_UQI: @@ -11142,6 +11200,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case V16SF_FTYPE_PCFLOAT_V16SF_UHI: case V8SF_FTYPE_PCFLOAT_V8SF_UQI: case V4SF_FTYPE_PCFLOAT_V4SF_UQI: + case V8HF_FTYPE_PCFLOAT16_V8HF_UQI: nargs = 3; klass = load; memory = 0; @@ -14054,7 +14113,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, tmp1 = gen_reg_rtx (SImode); emit_move_insn (tmp1, gen_lowpart (SImode, val)); - /* Insert the SImode value as low element of a V4SImode vector. */ + /* Insert the SImode value as low element of a V4SImode vector. */ tmp2 = gen_reg_rtx (V4SImode); emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); @@ -14179,6 +14238,8 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, break; case E_V8HImode: use_vector_set = TARGET_SSE2; + gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0 + ? 
gen_vec_setv8hi_0 : NULL; break; case E_V8QImode: use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; @@ -14190,8 +14251,12 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, use_vector_set = TARGET_SSE4_1; break; case E_V32QImode: + use_vector_set = TARGET_AVX; + break; case E_V16HImode: use_vector_set = TARGET_AVX; + gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0 + ? gen_vec_setv16hi_0 : NULL; break; case E_V8SImode: use_vector_set = TARGET_AVX; @@ -14239,6 +14304,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, use_vector_set = TARGET_AVX512FP16 && one_var == 0; gen_vec_set_0 = gen_vec_setv32hf_0; break; + case E_V32HImode: + use_vector_set = TARGET_AVX512FP16 && one_var == 0; + gen_vec_set_0 = gen_vec_setv32hi_0; default: break; } @@ -14638,7 +14706,7 @@ ix86_expand_vector_init_interleave (machine_mode mode, switch (mode) { case E_V8HFmode: - gen_load_even = gen_vec_setv8hf; + gen_load_even = gen_vec_interleave_lowv8hf; gen_interleave_first_low = gen_vec_interleave_lowv4si; gen_interleave_second_low = gen_vec_interleave_lowv2di; inner_mode = HFmode; @@ -14673,35 +14741,40 @@ ix86_expand_vector_init_interleave (machine_mode mode, op = ops [i + i]; if (inner_mode == HFmode) { - /* Convert HFmode to HImode. */ - op1 = gen_reg_rtx (HImode); - op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0); - op = gen_reg_rtx (HImode); - emit_move_insn (op, op1); + rtx even, odd; + /* Use vpuncklwd to pack 2 HFmode. */ + op0 = gen_reg_rtx (V8HFmode); + even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode); + odd = lowpart_subreg (V8HFmode, + force_reg (HFmode, ops[i + i + 1]), + HFmode); + emit_insn (gen_load_even (op0, even, odd)); } + else + { + /* Extend the odd elment to SImode using a paradoxical SUBREG. */ + op0 = gen_reg_rtx (SImode); + emit_move_insn (op0, gen_lowpart (SImode, op)); - /* Extend the odd elment to SImode using a paradoxical SUBREG. */ - op0 = gen_reg_rtx (SImode); - emit_move_insn (op0, gen_lowpart (SImode, op)); - - /* Insert the SImode value as low element of V4SImode vector. */ - op1 = gen_reg_rtx (V4SImode); - op0 = gen_rtx_VEC_MERGE (V4SImode, - gen_rtx_VEC_DUPLICATE (V4SImode, - op0), - CONST0_RTX (V4SImode), - const1_rtx); - emit_insn (gen_rtx_SET (op1, op0)); + /* Insert the SImode value as low element of V4SImode vector. */ + op1 = gen_reg_rtx (V4SImode); + op0 = gen_rtx_VEC_MERGE (V4SImode, + gen_rtx_VEC_DUPLICATE (V4SImode, + op0), + CONST0_RTX (V4SImode), + const1_rtx); + emit_insn (gen_rtx_SET (op1, op0)); - /* Cast the V4SImode vector back to a vector in orignal mode. */ - op0 = gen_reg_rtx (mode); - emit_move_insn (op0, gen_lowpart (mode, op1)); + /* Cast the V4SImode vector back to a vector in orignal mode. */ + op0 = gen_reg_rtx (mode); + emit_move_insn (op0, gen_lowpart (mode, op1)); - /* Load even elements into the second position. */ - emit_insn (gen_load_even (op0, - force_reg (inner_mode, - ops [i + i + 1]), - const1_rtx)); + /* Load even elements into the second position. */ + emit_insn (gen_load_even (op0, + force_reg (inner_mode, + ops[i + i + 1]), + const1_rtx)); + } /* Cast vector to FIRST_IMODE vector. 
*/ ops[i] = gen_reg_rtx (first_imode); @@ -15182,6 +15255,7 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) machine_mode inner_mode = GET_MODE_INNER (mode); machine_mode half_mode; bool use_vec_merge = false; + bool blendm_const = false; rtx tmp; static rtx (*gen_extract[7][2]) (rtx, rtx) = { @@ -15369,7 +15443,14 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) return; case E_V8HFmode: - use_vec_merge = true; + if (TARGET_AVX2) + { + mmode = SImode; + gen_blendm = gen_sse4_1_pblendph; + blendm_const = true; + } + else + use_vec_merge = true; break; case E_V8HImode: @@ -15396,10 +15477,20 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) goto half; case E_V16HFmode: - half_mode = V8HFmode; - j = 6; - n = 8; - goto half; + if (TARGET_AVX2) + { + mmode = SImode; + gen_blendm = gen_avx2_pblendph; + blendm_const = true; + break; + } + else + { + half_mode = V8HFmode; + j = 6; + n = 8; + goto half; + } case E_V16HImode: half_mode = V8HImode; @@ -15560,15 +15651,15 @@ quarter: { tmp = gen_reg_rtx (mode); emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); + rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode); /* The avx512*_blendm<mode> expanders have different operand order from VEC_MERGE. In VEC_MERGE, the first input operand is used for elements where the mask is set and second input operand otherwise, in {sse,avx}*_*blend* the first input operand is used for elements where the mask is clear and second input operand otherwise. */ - emit_insn (gen_blendm (target, target, tmp, - force_reg (mmode, - gen_int_mode (HOST_WIDE_INT_1U << elt, - mmode)))); + if (!blendm_const) + merge_mask = force_reg (mmode, merge_mask); + emit_insn (gen_blendm (target, target, tmp, merge_mask)); } else if (use_vec_merge) { diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index 5a99ea7..a525a83 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -2210,15 +2210,34 @@ remove_partial_avx_dependency (void) != AVX_PARTIAL_XMM_UPDATE_TRUE) continue; - if (!v4sf_const0) - v4sf_const0 = gen_reg_rtx (V4SFmode); - /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and vec_merge with subreg. 
*/ rtx src = SET_SRC (set); rtx dest = SET_DEST (set); machine_mode dest_mode = GET_MODE (dest); + machine_mode src_mode = GET_MODE (XEXP (src, 0)); + + switch (src_mode) + { + case E_SFmode: + case E_DFmode: + if (TARGET_USE_VECTOR_FP_CONVERTS + || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY) + continue; + break; + case E_SImode: + case E_DImode: + if (TARGET_USE_VECTOR_CONVERTS + || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY) + continue; + break; + default: + break; + } + + if (!v4sf_const0) + v4sf_const0 = gen_reg_rtx (V4SFmode); rtx zero; machine_mode dest_vecmode; diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index fcadfcd..2a2c8b8 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -90,6 +90,8 @@ VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */ VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */ VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */ VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */ +VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ +VECTOR_MODE (FLOAT, HF, 6); /* V6HF */ VECTOR_MODE (INT, TI, 1); /* V1TI */ VECTOR_MODE (INT, DI, 1); /* V1DI */ VECTOR_MODE (INT, SI, 1); /* V1SI */ diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index c0006b3..e7a3bd4 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -724,7 +724,7 @@ static const struct processor_costs *processor_cost_table[] = &slm_cost, &slm_cost, &slm_cost, - &slm_cost, + &tremont_cost, &slm_cost, &slm_cost, &skylake_cost, diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dcae34b..708834a 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -320,7 +320,7 @@ struct ix86_address addr_space_t seg; }; -extern int ix86_decompose_address (rtx, struct ix86_address *); +extern bool ix86_decompose_address (rtx, struct ix86_address *); extern int memory_address_length (rtx, bool); extern void x86_output_aligned_bss (FILE *, tree, const char *, unsigned HOST_WIDE_INT, unsigned); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7b173bc..afc2674 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -10101,10 +10101,10 @@ ix86_live_on_entry (bitmap regs) } /* Extract the parts of an RTL expression that is a valid memory address - for an instruction. Return 0 if the structure of the address is + for an instruction. Return false if the structure of the address is grossly off. 
*/ -int +bool ix86_decompose_address (rtx addr, struct ix86_address *out) { rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; @@ -10123,17 +10123,17 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) { addr = XEXP (addr, 0); if (CONST_INT_P (addr)) - return 0; + return false; } else if (GET_CODE (addr) == AND && const_32bit_mask (XEXP (addr, 1), DImode)) { addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode); if (addr == NULL_RTX) - return 0; + return false; if (CONST_INT_P (addr)) - return 0; + return false; } else if (GET_CODE (addr) == AND) { @@ -10167,7 +10167,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) { addr = SUBREG_REG (addr); if (CONST_INT_P (addr)) - return 0; + return false; } } @@ -10178,7 +10178,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) if (REG_P (SUBREG_REG (addr))) base = addr; else - return 0; + return false; } else if (GET_CODE (addr) == PLUS) { @@ -10189,13 +10189,13 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) do { if (n >= 4) - return 0; + return false; addends[n++] = XEXP (op, 1); op = XEXP (op, 0); } while (GET_CODE (op) == PLUS); if (n >= 4) - return 0; + return false; addends[n] = op; for (i = n; i >= 0; --i) @@ -10205,28 +10205,28 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) { case MULT: if (index) - return 0; + return false; index = XEXP (op, 0); scale_rtx = XEXP (op, 1); break; case ASHIFT: if (index) - return 0; + return false; index = XEXP (op, 0); tmp = XEXP (op, 1); if (!CONST_INT_P (tmp)) - return 0; + return false; scale = INTVAL (tmp); if ((unsigned HOST_WIDE_INT) scale > 3) - return 0; + return false; scale = 1 << scale; break; case ZERO_EXTEND: op = XEXP (op, 0); if (GET_CODE (op) != UNSPEC) - return 0; + return false; /* FALLTHRU */ case UNSPEC: @@ -10235,12 +10235,12 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) && seg == ADDR_SPACE_GENERIC) seg = DEFAULT_TLS_SEG_REG; else - return 0; + return false; break; case SUBREG: if (!REG_P (SUBREG_REG (op))) - return 0; + return false; /* FALLTHRU */ case REG: @@ -10249,7 +10249,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) else if (!index) index = op; else - return 0; + return false; break; case CONST: @@ -10257,12 +10257,12 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) case SYMBOL_REF: case LABEL_REF: if (disp) - return 0; + return false; disp = op; break; default: - return 0; + return false; } } } @@ -10277,10 +10277,10 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) index = XEXP (addr, 0); tmp = XEXP (addr, 1); if (!CONST_INT_P (tmp)) - return 0; + return false; scale = INTVAL (tmp); if ((unsigned HOST_WIDE_INT) scale > 3) - return 0; + return false; scale = 1 << scale; } else @@ -10294,14 +10294,14 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) && REG_P (SUBREG_REG (index))) ; else - return 0; + return false; } /* Extract the integral value of scale. */ if (scale_rtx) { if (!CONST_INT_P (scale_rtx)) - return 0; + return false; scale = INTVAL (scale_rtx); } @@ -10354,7 +10354,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) out->scale = scale; out->seg = seg; - return 1; + return true; } /* Return cost of the memory address x. @@ -16976,6 +16976,7 @@ ix86_sched_init_global (FILE *, int, int) case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: + case PROCESSOR_TREMONT: case PROCESSOR_GENERIC: /* Do not perform multipass scheduling for pre-reload schedule to save compile time. 
*/ @@ -19443,8 +19444,11 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to, /* Vector registers do not support QI or HImode loads. If we don't disallow a change to these modes, reload will assume it's ok to drop the subreg from (subreg:SI (reg:HI 100) 0). This affects - the vec_dupv4hi pattern. */ - if (GET_MODE_SIZE (from) < 4) + the vec_dupv4hi pattern. + NB: AVX512FP16 supports vmovw which can load 16bit data to sse + register. */ + int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 4; + if (GET_MODE_SIZE (from) < mov_size) return false; } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index e76bb55..ec60b89 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -334,6 +334,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY] #define TARGET_SSE_PARTIAL_REG_DEPENDENCY \ ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY] +#define TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY \ + ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY] +#define TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY \ + ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY] #define TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL] #define TARGET_SSE_UNALIGNED_STORE_OPTIMAL \ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 13f6f57..c82a9dc 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4535,7 +4535,8 @@ (float_extend:DF (match_operand:SF 1 "nonimmediate_operand")))] "!TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + && TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY + && epilogue_completed && optimize_function_for_speed_p (cfun) && (!REG_P (operands[1]) || (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1]))) @@ -4708,7 +4709,8 @@ (float_truncate:SF (match_operand:DF 1 "nonimmediate_operand")))] "!TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + && TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY + && epilogue_completed && optimize_function_for_speed_p (cfun) && (!REG_P (operands[1]) || (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1]))) @@ -5243,7 +5245,8 @@ [(set (match_operand:MODEF 0 "sse_reg_operand") (float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))] "!TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + && TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY + && epilogue_completed && optimize_function_for_speed_p (cfun) && (!EXT_REX_SSE_REG_P (operands[0]) || TARGET_AVX512VL)" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 516eb45..d7a1328 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -396,6 +396,13 @@ (define_mode_iterator VF1_AVX512ER_128_256 [(V16SF "TARGET_AVX512ER") (V8SF "TARGET_AVX") V4SF]) +(define_mode_iterator VFH_AVX512VL + [(V32HF "TARGET_AVX512FP16") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") + V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) + (define_mode_iterator VF2_AVX512VL [V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) @@ -405,6 +412,9 @@ (define_mode_iterator VF_AVX512FP16 [V32HF V16HF V8HF]) +(define_mode_iterator VF_AVX512FP16VL + [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")]) + ;; All vector integer modes (define_mode_iterator VI [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") @@ -493,6 
+503,11 @@ (define_mode_iterator VI2_AVX512VL [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI]) +(define_mode_iterator VI2H_AVX512VL + [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI + (V8SI "TARGET_AVX512VL") V16SI + V8DI ]) + (define_mode_iterator VI1_AVX512VL_F [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F")]) @@ -622,6 +637,9 @@ (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")]) +(define_mode_iterator VF4_128_8_256 + [V4DF V4SF]) + (define_mode_iterator VI1_AVX512VLBW [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL") (V16QI "TARGET_AVX512VL")]) @@ -707,7 +725,8 @@ [(V16SF "V4SF") (V8DF "V2DF") (V16SI "TI") (V8DI "TI")]) (define_mode_attr vecmemsuffix - [(V16SF "{z}") (V8SF "{y}") (V4SF "{x}") + [(V32HF "{z}") (V16HF "{y}") (V8HF "{x}") + (V16SF "{z}") (V8SF "{y}") (V4SF "{x}") (V8DF "{z}") (V4DF "{y}") (V2DF "{x}")]) (define_mode_attr ssedoublemodelower @@ -727,6 +746,11 @@ [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI") (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")]) +(define_mode_attr sseintconvert + [(V32HI "w") (V16HI "w") (V8HI "w") + (V16SI "dq") (V8SI "dq") (V4SI "dq") + (V8DI "qq") (V4DI "qq") (V2DI "qq")]) + ;; All 128bit vector integer modes (define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI]) @@ -768,6 +792,7 @@ (V32HF "TARGET_AVX512BW")]) ;; Int-float size matches +(define_mode_iterator VI2F [V8HI V16HI V32HI V8HF V16HF V32HF]) (define_mode_iterator VI4F_128 [V4SI V4SF]) (define_mode_iterator VI8F_128 [V2DI V2DF]) (define_mode_iterator VI4F_256 [V8SI V8SF]) @@ -782,6 +807,12 @@ (V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")]) (define_mode_iterator VF48_I1248 [V16SI V16SF V8DI V8DF V32HI V64QI]) +(define_mode_iterator VF48H_AVX512VL + [V8DF V16SF (V8SF "TARGET_AVX512VL")]) + +(define_mode_iterator VF48_128 + [V2DF V4SF]) + (define_mode_iterator VI48F [V16SI V16SF V8DI V8DF (V8SI "TARGET_AVX512VL") (V8SF "TARGET_AVX512VL") @@ -806,6 +837,7 @@ (V8SF "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL") V16SF V8DF]) +(define_mode_iterator V8_128 [V8HI V8HF]) (define_mode_iterator V16_256 [V16HI V16HF]) (define_mode_iterator V32_512 [V32HI V32HF]) @@ -918,9 +950,9 @@ ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode - [(V16SF "V16SI") (V8DF "V8DI") - (V8SF "V8SI") (V4DF "V4DI") - (V4SF "V4SI") (V2DF "V2DI") + [(V32HF "V32HI") (V16SF "V16SI") (V8DF "V8DI") + (V16HF "V16HI") (V8SF "V8SI") (V4DF "V4DI") + (V8HF "V8HI") (V4SF "V4SI") (V2DF "V2DI") (V16SI "V16SI") (V8DI "V8DI") (V8SI "V8SI") (V4DI "V4DI") (V4SI "V4SI") (V2DI "V2DI") @@ -971,6 +1003,13 @@ (V4SF "v2sf") (V32HF "v16hf") (V16HF "v8hf") (V8HF "v4hf")]) +;; Mapping of vector modes to vector hf modes of conversion. +(define_mode_attr ssePHmode + [(V32HI "V32HF") (V16HI "V16HF") (V8HI "V8HF") + (V16SI "V16HF") (V8SI "V8HF") (V4SI "V8HF") + (V8DI "V8HF") (V4DI "V8HF") (V2DI "V8HF") + (V8DF "V8HF") (V16SF "V16HF") (V8SF "V8HF")]) + ;; Mapping of vector modes to packed single mode of the same size (define_mode_attr ssePSmode [(V16SI "V16SF") (V8DF "V16SF") @@ -1116,7 +1155,8 @@ ;; Mapping of mode to cast intrinsic name (define_mode_attr castmode - [(V8SI "si") (V8SF "ps") (V4DF "pd") + [(V4SF "ps") (V2DF "pd") + (V8SI "si") (V8SF "ps") (V4DF "pd") (V16SI "si") (V16SF "ps") (V8DF "pd")]) ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise. 
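The mode attributes introduced above (for instance <sseintconvert>, which supplies the w/dq/qq mnemonic suffix, and <ssePHmode>, which maps each integer vector mode to its half-float counterpart) let a single define_insn template further down in sse.md expand into the whole vcvtph2w / vcvtph2dq / vcvtph2qq conversion family. As a minimal user-level sketch of the kind of conversion those patterns implement — assuming a compiler built with this patch, -mavx512fp16, and the GNU vector extension __builtin_convertvector; the function and type names below are illustrative only and not part of the patch:

    /* Compile with -mavx512fp16; illustrative only.  */
    typedef short     v32hi __attribute__ ((vector_size (64)));
    typedef _Float16  v32hf __attribute__ ((vector_size (64)));

    /* Convert 32 signed 16-bit integers to 32 half-precision floats.
       With the new conversion patterns available, this is expected to
       lower to a single vcvtw2ph instruction instead of a scalar loop.  */
    v32hf
    cvt_w_to_ph (v32hi x)
    {
      return __builtin_convertvector (x, v32hf);
    }
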
@@ -1349,13 +1389,13 @@ [(set (match_dup 0) (match_dup 1))]) (define_insn "avx512f_mov<ssescalarmodelower>_mask" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (vec_merge:VF_128 - (match_operand:VF_128 2 "register_operand" "v") - (match_operand:VF_128 3 "nonimm_or_0_operand" "0C") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (vec_merge:VFH_128 + (match_operand:VFH_128 2 "register_operand" "v") + (match_operand:VFH_128 3 "nonimm_or_0_operand" "0C") (match_operand:QI 4 "register_operand" "Yk")) - (match_operand:VF_128 1 "register_operand" "v") + (match_operand:VFH_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F" "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}" @@ -1368,7 +1408,7 @@ (vec_merge:<ssevecmode> (vec_merge:<ssevecmode> (vec_duplicate:<ssevecmode> - (match_operand:MODEF 1 "memory_operand")) + (match_operand:MODEFH 1 "memory_operand")) (match_operand:<ssevecmode> 2 "nonimm_or_0_operand") (match_operand:QI 3 "register_operand")) (match_dup 4) @@ -1381,7 +1421,7 @@ (vec_merge:<ssevecmode> (vec_merge:<ssevecmode> (vec_duplicate:<ssevecmode> - (match_operand:MODEF 1 "memory_operand" "m")) + (match_operand:MODEFH 1 "memory_operand" "m")) (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C") (match_operand:QI 3 "register_operand" "Yk")) (match_operand:<ssevecmode> 4 "const0_operand" "C") @@ -1394,11 +1434,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512f_store<mode>_mask" - [(set (match_operand:MODEF 0 "memory_operand" "=m") - (if_then_else:MODEF + [(set (match_operand:MODEFH 0 "memory_operand" "=m") + (if_then_else:MODEFH (and:QI (match_operand:QI 2 "register_operand" "Yk") (const_int 1)) - (vec_select:MODEF + (vec_select:MODEFH (match_operand:<ssevecmode> 1 "register_operand" "v") (parallel [(const_int 0)])) (match_dup 0)))] @@ -2338,6 +2378,30 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "avx512fp16_rcp<mode>2<mask_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=v") + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "vm")] + UNSPEC_RCP))] + "TARGET_AVX512FP16" + "vrcpph\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512fp16_vmrcpv8hf2<mask_scalar_name>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (unspec:V8HF [(match_operand:V8HF 1 "nonimmediate_operand" "vm")] + UNSPEC_RCP) + (match_operand:V8HF 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vrcpsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + (define_insn "<mask_codefor>rcp14<mode><mask_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (unspec:VF_AVX512VL @@ -2381,8 +2445,8 @@ (set_attr "mode" "<MODE>")]) (define_expand "sqrt<mode>2" - [(set (match_operand:VF2 0 "register_operand") - (sqrt:VF2 (match_operand:VF2 1 "vector_operand")))] + [(set (match_operand:VF2H 0 "register_operand") + (sqrt:VF2H (match_operand:VF2H 1 "vector_operand")))] "TARGET_SSE2") (define_expand "sqrt<mode>2" @@ -2402,8 +2466,8 @@ }) (define_insn "<sse>_sqrt<mode>2<mask_name><round_name>" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (sqrt:VF (match_operand:VF 1 "<round_nimm_predicate>" "xBm,<round_constraint>")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + 
(sqrt:VFH (match_operand:VFH 1 "<round_nimm_predicate>" "xBm,<round_constraint>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ sqrt<ssemodesuffix>\t{%1, %0|%0, %1} @@ -2416,11 +2480,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "<sse>_vmsqrt<mode>2<mask_scalar_name><round_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (sqrt:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" "xm,<round_scalar_constraint>")) - (match_operand:VF_128 2 "register_operand" "0,v") + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (sqrt:VFH_128 + (match_operand:VFH_128 1 "nonimmediate_operand" "xm,<round_scalar_constraint>")) + (match_operand:VFH_128 2 "register_operand" "0,v") (const_int 1)))] "TARGET_SSE" "@ @@ -2473,6 +2537,16 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<MODE>")]) +(define_insn "<sse>_rsqrt<mode>2<mask_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=v") + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "vector_operand" "vBm")] UNSPEC_RSQRT))] + "TARGET_AVX512FP16" + "vrsqrtph\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + (define_insn "<mask_codefor>rsqrt14<mode><mask_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (unspec:VF_AVX512VL @@ -2548,6 +2622,19 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "avx512fp16_vmrsqrtv8hf2<mask_scalar_name>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (unspec:V8HF [(match_operand:V8HF 1 "nonimmediate_operand" "vm")] + UNSPEC_RSQRT) + (match_operand:V8HF 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vrsqrtsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + (define_expand "cond_<code><mode>" [(set (match_operand:VF 0 "register_operand") (vec_merge:VF @@ -3200,28 +3287,28 @@ }) (define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL + [(match_operand:VFH_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_REDUCE))] - "TARGET_AVX512DQ" + "TARGET_AVX512DQ || (VALID_AVX512FP16_REG_MODE (<MODE>mode))" "vreduce<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}" [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) (define_insn "reduces<mode><mask_scalar_name><round_saeonly_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_REDUCE) (match_dup 1) (const_int 
1)))] - "TARGET_AVX512DQ" + "TARGET_AVX512DQ || (VALID_AVX512FP16_REG_MODE (<MODE>mode))" "vreduce<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}" [(set_attr "type" "sse") (set_attr "prefix" "evex") @@ -5655,6 +5742,552 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; +;; Parallel half-precision floating point conversion operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_int_iterator UNSPEC_US_FIX_NOTRUNC + [UNSPEC_UNSIGNED_FIX_NOTRUNC UNSPEC_FIX_NOTRUNC]) + +(define_int_attr sseintconvertsignprefix + [(UNSPEC_UNSIGNED_FIX_NOTRUNC "u") + (UNSPEC_FIX_NOTRUNC "")]) + +(define_mode_attr qq2phsuff + [(V32HI "") (V16HI "") (V8HI "") + (V16SI "") (V8SI "{y}") (V4SI "{x}") + (V8DI "{z}") (V4DI "{y}") (V2DI "{x}") + (V16SF "") (V8SF "{y}") (V4SF "{x}") + (V8DF "{z}") (V4DF "{y}") (V2DF "{x}")]) + +(define_insn "avx512fp16_vcvtph2<sseintconvertsignprefix><sseintconvert>_<mode><mask_name><round_name>" + [(set (match_operand:VI248_AVX512VL 0 "register_operand" "=v") + (unspec:VI248_AVX512VL + [(match_operand:<ssePHmode> 1 "<round_nimm_predicate>" "<round_constraint>")] + UNSPEC_US_FIX_NOTRUNC))] + "TARGET_AVX512FP16" + "vcvtph2<sseintconvertsignprefix><sseintconvert>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode><mask_name><round_name>" + [(set (match_operand:<ssePHmode> 0 "register_operand" "=v") + (any_float:<ssePHmode> + (match_operand:VI2H_AVX512VL 1 "<round_nimm_predicate>" "<round_constraint>")))] + "TARGET_AVX512FP16" + "vcvt<floatsuffix><sseintconvert>2ph<round_qq2phsuff>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) + (match_dup 2)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V4HFmode);") + +(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) + (match_operand:V4HF 2 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) + (vec_select:V4HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_dup 4)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V4HFmode);") + +(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + 
(vec_merge:V4HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) + (vec_select:V4HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V4HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask_1" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 + "vector_operand" "vm")) + (match_operand:V4HF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V4HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) + (match_dup 2)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V6HFmode);") + +(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) + (match_operand:V6HF 2 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix>qq2ph{x}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) + (vec_select:V2HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_dup 4)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V6HFmode);") + +(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) + (vec_select:V2HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V6HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix>qq2ph{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask_1" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (any_float:V2HF (match_operand:V2DI 1 + "vector_operand" "vm")) + (match_operand:V2HF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V6HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix>qq2ph{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" 
"TI")]) + +(define_insn "avx512fp16_vcvtsh2<sseintconvertsignprefix>si<rex64namesuffix><round_name>" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (unspec:SWI48 + [(vec_select:HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0)]))] + UNSPEC_US_FIX_NOTRUNC))] + "TARGET_AVX512FP16" + "vcvtsh2<sseintconvertsignprefix>si\t{<round_op2>%1, %0|%0, %1<round_op2>}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512fp16_vcvtsh2<sseintconvertsignprefix>si<rex64namesuffix>_2" + [(set (match_operand:SWI48 0 "register_operand" "=r,r") + (unspec:SWI48 + [(match_operand:HF 1 "nonimmediate_operand" "v,m")] + UNSPEC_US_FIX_NOTRUNC))] + "TARGET_AVX512FP16" + "vcvtsh2<sseintconvertsignprefix>si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_mode_attr sseicvtsuffix + [(SI "l") (DI "q")]) +(define_insn "avx512fp16_vcvt<floatsuffix>si2sh<rex64namesuffix><round_name>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (vec_duplicate:V8HF + (any_float:HF + (match_operand:SWI48 2 "<round_nimm_scalar_predicate>" "<round_constraint3>"))) + (match_operand:V8HF 1 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvt<floatsuffix>si2sh{<sseicvtsuffix>}\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name><round_saeonly_name>" + [(set (match_operand:VI2H_AVX512VL 0 "register_operand" "=v") + (any_fix:VI2H_AVX512VL + (match_operand:<ssePHmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))] + "TARGET_AVX512FP16" + "vcvttph2<fixsuffix><sseintconvert>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name>" + [(set (match_operand:VI4_128_8_256 0 "register_operand" "=v") + (any_fix:VI4_128_8_256 + (vec_select:V4HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvttph2<fixsuffix><sseintconvert>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512fp16_fix<fixunssuffix>_trunc<mode>2_load<mask_name>" + [(set (match_operand:VI4_128_8_256 0 "register_operand" "=v") + (any_fix:VI4_128_8_256 + (match_operand:V4HF 1 "memory_operand" "m")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvttph2<fixsuffix><sseintconvert>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_truncv2di2<mask_name>" + [(set (match_operand:V2DI 0 "register_operand" "=v") + (any_fix:V2DI + (vec_select:V2HF + (match_operand:V8HF 1 "nonimmediate_operand" "v") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvttph2<fixsuffix>qq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*avx512fp16_fix<fixunssuffix>_truncv2di2_load<mask_name>" + [(set (match_operand:V2DI 0 
"register_operand" "=v") + (any_fix:V2DI + (match_operand:V2HF 1 "memory_operand" "m")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvttph2<fixsuffix>qq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<round_saeonly_name>" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (any_fix:SWI48 + (vec_select:HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0)]))))] + "TARGET_AVX512FP16" + "%vcvttsh2<fixsuffix>si\t{<round_saeonly_op2>%1, %0|%0, %k1<round_saeonly_op2>}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2_mem" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (any_fix:SWI48 + (match_operand:HF 1 "memory_operand" "vm")))] + "TARGET_AVX512FP16" + "%vcvttsh2<fixsuffix>si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_mode_attr ph2pssuffix + [(V16SF "x") (V8SF "x") (V4SF "x") + (V8DF "") (V4DF "") (V2DF "")]) + +(define_insn "avx512fp16_float_extend_ph<mode>2<mask_name><round_saeonly_name>" + [(set (match_operand:VF48H_AVX512VL 0 "register_operand" "=v") + (float_extend:VF48H_AVX512VL + (match_operand:<ssePHmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))] + "TARGET_AVX512FP16" + "vcvtph2<castmode><ph2pssuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_float_extend_ph<mode>2<mask_name>" + [(set (match_operand:VF4_128_8_256 0 "register_operand" "=v") + (float_extend:VF4_128_8_256 + (vec_select:V4HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtph2<castmode><ph2pssuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512fp16_float_extend_ph<mode>2_load<mask_name>" + [(set (match_operand:VF4_128_8_256 0 "register_operand" "=v") + (float_extend:VF4_128_8_256 + (match_operand:V4HF 1 "memory_operand" "m")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtph2<castmode><ph2pssuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_float_extend_phv2df2<mask_name>" + [(set (match_operand:V2DF 0 "register_operand" "=v") + (float_extend:V2DF + (vec_select:V2HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtph2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*avx512fp16_float_extend_phv2df2_load<mask_name>" + [(set (match_operand:V2DF 0 "register_operand" "=v") + (float_extend:V2DF + (match_operand:V2HF 1 "memory_operand" "m")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtph2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn 
"avx512fp16_vcvt<castmode>2ph_<mode><mask_name><round_name>" + [(set (match_operand:<ssePHmode> 0 "register_operand" "=v") + (float_truncate:<ssePHmode> + (match_operand:VF48H_AVX512VL 1 "<round_nimm_predicate>" "<round_constraint>")))] + "TARGET_AVX512FP16" + "vcvt<castmode>2ph<ph2pssuffix><round_qq2phsuff>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (match_dup 2)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V4HFmode);") + +(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (match_operand:V4HF 2 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (vec_select:V4HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_dup 4)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V4HFmode);") + +(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (vec_select:V4HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V4HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>_mask_1" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (match_operand:V4HF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V4HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvtpd2ph_v2df" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (float_truncate:V2HF + (match_operand:V2DF 1 "vector_operand" "vm")) + (match_dup 2)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V6HFmode);") + +(define_insn "*avx512fp16_vcvtpd2ph_v2df" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (float_truncate:V2HF + (match_operand:V2DF 1 "vector_operand" 
"vm")) + (match_operand:V6HF 2 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtpd2ph{x}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_expand "avx512fp16_vcvtpd2ph_v2df_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (float_truncate:V2HF + (match_operand:V2DF 1 "vector_operand" "vm")) + (vec_select:V2HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_dup 4)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V6HFmode);") + +(define_insn "*avx512fp16_vcvtpd2ph_v2df_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand" "vm")) + (vec_select:V2HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V6HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtpd2ph{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*avx512fp16_vcvtpd2ph_v2df_mask_1" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (float_truncate:V2HF + (match_operand:V2DF 1 "vector_operand" "vm")) + (match_operand:V2HF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V6HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtpd2ph{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_vcvtsh2<ssescalarmodesuffix><mask_scalar_name><round_saeonly_scalar_name>" + [(set (match_operand:VF48_128 0 "register_operand" "=v") + (vec_merge:VF48_128 + (vec_duplicate:VF48_128 + (float_extend:<ssescalarmode> + (vec_select:HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0)])))) + (match_operand:VF48_128 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvtsh2<ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1<round_saeonly_scalar_mask_op3>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_vcvtsh2<ssescalarmodesuffix><mask_scalar_name>_mem" + [(set (match_operand:VF48_128 0 "register_operand" "=v") + (vec_merge:VF48_128 + (vec_duplicate:VF48_128 + (float_extend:<ssescalarmode> + (match_operand:HF 1 "memory_operand" "m"))) + (match_operand:VF48_128 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvtsh2<ssescalarmodesuffix>\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_vcvt<ssescalarmodesuffix>2sh<mask_scalar_name><round_scalar_name>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (vec_duplicate:V8HF + (float_truncate:HF + (vec_select:<ssescalarmode> + (match_operand:VF48_128 1 "register_operand" "v") + (parallel [(const_int 0)])))) + (match_operand:V8HF 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvt<ssescalarmodesuffix>2sh\t{<round_scalar_mask_op3>%1, %2, 
%0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1<round_scalar_mask_op3>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_vcvt<ssescalarmodesuffix>2sh<mask_scalar_name>_mem" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (vec_duplicate:V8HF + (float_truncate:HF + (match_operand:MODEF 1 "memory_operand" "m"))) + (match_operand:V8HF 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvt<ssescalarmodesuffix>2sh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; Parallel single-precision floating point conversion operations ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -8759,11 +9392,11 @@ ;; vmovw clears also the higer bits (define_insn "vec_set<mode>_0" - [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v,v") - (vec_merge:VF_AVX512FP16 - (vec_duplicate:VF_AVX512FP16 - (match_operand:HF 2 "nonimmediate_operand" "r,m")) - (match_operand:VF_AVX512FP16 1 "const0_operand" "C,C") + [(set (match_operand:VI2F 0 "register_operand" "=v,v") + (vec_merge:VI2F + (vec_duplicate:VI2F + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,m")) + (match_operand:VI2F 1 "const0_operand" "C,C") (const_int 1)))] "TARGET_AVX512FP16" "@ @@ -9031,7 +9664,8 @@ [(V16SF "avx512f") (V16SI "avx512f") (V8DF "avx512dq") (V8DI "avx512dq")]) (define_mode_attr extract_suf - [(V16SF "32x4") (V16SI "32x4") (V8DF "64x2") (V8DI "64x2")]) + [(V16SF "32x4") (V16SI "32x4") (V8DF "64x2") (V8DI "64x2") + (V8SF "32x4") (V8SI "32x4") (V4DF "64x2") (V4DI "64x2")]) (define_mode_iterator AVX512_VEC [(V8DF "TARGET_AVX512DQ") (V8DI "TARGET_AVX512DQ") V16SF V16SI]) @@ -9891,16 +10525,33 @@ "operands[1] = gen_lowpart (HFmode, operands[1]);") (define_insn "*vec_extracthf" - [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=r,m") + [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=*r,m,x,v") (vec_select:HF - (match_operand:V8HF 1 "register_operand" "v,v") + (match_operand:V8HF 1 "register_operand" "v,v,0,v") (parallel [(match_operand:SI 2 "const_0_to_7_operand")])))] "TARGET_SSE2" - "@ - vpextrw\t{%2, %1, %k0|%k0, %1, %2} - vpextrw\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sselog1") +{ + switch (which_alternative) + { + case 0: + return "vpextrw\t{%2, %1, %k0|%k0, %1, %2}"; + case 1: + return "vpextrw\t{%2, %1, %0|%0, %1, %2}"; + + case 2: + operands[2] = GEN_INT (INTVAL (operands[2]) * 2); + return "psrldq\t{%2, %0|%0, %2}"; + case 3: + operands[2] = GEN_INT (INTVAL (operands[2]) * 2); + return "vpsrldq\t{%2, %1, %0|%0, %1, %2}"; + + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "*,*,noavx,avx") + (set_attr "type" "sselog1,sselog1,sseishft1,sseishft1") (set_attr "prefix" "maybe_evex") (set_attr "mode" "TI")]) @@ -10255,11 +10906,11 @@ }) (define_insn "avx512f_vmscalef<mode><mask_scalar_name><round_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_scalar_nimm_predicate>" "<round_scalar_constraint>")] + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "<round_scalar_nimm_predicate>" 
"<round_scalar_constraint>")] UNSPEC_SCALEF) (match_dup 1) (const_int 1)))] @@ -10269,10 +10920,10 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "<avx512>_scalef<mode><mask_name><round_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "register_operand" "v") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")] + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL + [(match_operand:VFH_AVX512VL 1 "register_operand" "v") + (match_operand:VFH_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")] UNSPEC_SCALEF))] "TARGET_AVX512F" "vscalef<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" @@ -10558,8 +11209,8 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_getexp<mode><mask_name><round_saeonly_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL [(match_operand:VF_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")] + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL [(match_operand:VFH_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")] UNSPEC_GETEXP))] "TARGET_AVX512F" "vgetexp<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"; @@ -10567,11 +11218,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512f_sgetexp<mode><mask_scalar_name><round_saeonly_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")] + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")] UNSPEC_GETEXP) (match_dup 1) (const_int 1)))] @@ -10603,9 +11254,21 @@ (match_operand:V48_256_512_AVX512VL 1 "register_operand" "v") (parallel [(match_operand 2 "<vec_extract_imm_predicate>")])))] "TARGET_AVX512F - && INTVAL(operands[2]) >= 16 / GET_MODE_SIZE (<ssescalarmode>mode)" - "valign<ternlogsuffix>\t{%2, %1, %1, %<xtg_mode>0|%<xtg_mode>0, %1, %1, %2}"; - [(set_attr "prefix" "evex") + && INTVAL(operands[2]) * GET_MODE_SIZE (<ssescalarmode>mode) >= 16" +{ + int byte_offset = INTVAL (operands[2]) * GET_MODE_SIZE (<ssescalarmode>mode); + if (byte_offset % 16 == 0) + { + operands[2] = GEN_INT (byte_offset / 16); + if (byte_offset / 16 == 1) + return "vextract<shuffletype><extract_suf>\t{%2, %t1, %x0|%x0, %t1, %2}"; + else + return "vextract<shuffletype><extract_suf>\t{%2, %1, %x0|%x0, %1, %2}"; + } + else + return "valign<ternlogsuffix>\t{%2, %1, %1, %<xtg_mode>0|%<xtg_mode>0, %1, %1, %2}"; +} + [(set_attr "prefix" "maybe_evex") (set_attr "mode" "<sseintvecinsnmode>")]) (define_expand "avx512f_shufps512_mask" @@ -10737,9 +11400,9 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "<avx512>_rndscale<mode><mask_name><round_saeonly_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>") + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL + [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" 
"<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_ROUND))] "TARGET_AVX512F" @@ -10749,13 +11412,13 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512f_rndscale<mode><mask_scalar_name><round_saeonly_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_ROUND) - (match_operand:VF_128 1 "register_operand" "v") + (match_operand:VFH_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F" "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}" @@ -10764,14 +11427,14 @@ (set_attr "mode" "<MODE>")]) (define_insn "*avx512f_rndscale<mode><round_saeonly_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (unspec:<ssescalarmode> [(match_operand:<ssescalarmode> 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_ROUND)) - (match_operand:VF_128 1 "register_operand" "v") + (match_operand:VFH_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F" "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}" @@ -15359,12 +16022,12 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_insn "avx512bw_interleave_highv32hi<mask_name>" - [(set (match_operand:V32HI 0 "register_operand" "=v") - (vec_select:V32HI - (vec_concat:V64HI - (match_operand:V32HI 1 "register_operand" "v") - (match_operand:V32HI 2 "nonimmediate_operand" "vm")) +(define_insn "avx512bw_interleave_high<mode><mask_name>" + [(set (match_operand:V32_512 0 "register_operand" "=v") + (vec_select:V32_512 + (vec_concat:<ssedoublevecmode> + (match_operand:V32_512 1 "register_operand" "v") + (match_operand:V32_512 2 "nonimmediate_operand" "vm")) (parallel [(const_int 4) (const_int 36) (const_int 5) (const_int 37) (const_int 6) (const_int 38) @@ -15387,12 +16050,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn "avx2_interleave_highv16hi<mask_name>" - [(set (match_operand:V16HI 0 "register_operand" "=Yw") - (vec_select:V16HI - (vec_concat:V32HI - (match_operand:V16HI 1 "register_operand" "Yw") - (match_operand:V16HI 2 "nonimmediate_operand" "Ywm")) +(define_insn "avx2_interleave_high<mode><mask_name>" + [(set (match_operand:V16_256 0 "register_operand" "=Yw") + (vec_select:V16_256 + (vec_concat:<ssedoublevecmode> + (match_operand:V16_256 1 "register_operand" "Yw") + (match_operand:V16_256 2 "nonimmediate_operand" "Ywm")) (parallel [(const_int 4) (const_int 20) (const_int 5) (const_int 21) (const_int 6) (const_int 22) @@ -15407,12 +16070,12 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) -(define_insn "vec_interleave_highv8hi<mask_name>" - [(set (match_operand:V8HI 0 "register_operand" "=x,Yw") - (vec_select:V8HI - (vec_concat:V16HI - (match_operand:V8HI 1 "register_operand" "0,Yw") - (match_operand:V8HI 2 "vector_operand" "xBm,Ywm")) +(define_insn 
"vec_interleave_high<mode><mask_name>" + [(set (match_operand:V8_128 0 "register_operand" "=x,Yw") + (vec_select:V8_128 + (vec_concat:<ssedoublevecmode> + (match_operand:V8_128 1 "register_operand" "0,Yw") + (match_operand:V8_128 2 "vector_operand" "xBm,Ywm")) (parallel [(const_int 4) (const_int 12) (const_int 5) (const_int 13) (const_int 6) (const_int 14) @@ -15427,12 +16090,12 @@ (set_attr "prefix" "orig,maybe_vex") (set_attr "mode" "TI")]) -(define_insn "<mask_codefor>avx512bw_interleave_lowv32hi<mask_name>" - [(set (match_operand:V32HI 0 "register_operand" "=v") - (vec_select:V32HI - (vec_concat:V64HI - (match_operand:V32HI 1 "register_operand" "v") - (match_operand:V32HI 2 "nonimmediate_operand" "vm")) +(define_insn "<mask_codefor>avx512bw_interleave_low<mode><mask_name>" + [(set (match_operand:V32_512 0 "register_operand" "=v") + (vec_select:V32_512 + (vec_concat:<ssedoublevecmode> + (match_operand:V32_512 1 "register_operand" "v") + (match_operand:V32_512 2 "nonimmediate_operand" "vm")) (parallel [(const_int 0) (const_int 32) (const_int 1) (const_int 33) (const_int 2) (const_int 34) @@ -15455,12 +16118,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn "avx2_interleave_lowv16hi<mask_name>" - [(set (match_operand:V16HI 0 "register_operand" "=Yw") - (vec_select:V16HI - (vec_concat:V32HI - (match_operand:V16HI 1 "register_operand" "Yw") - (match_operand:V16HI 2 "nonimmediate_operand" "Ywm")) +(define_insn "avx2_interleave_low<mode><mask_name>" + [(set (match_operand:V16_256 0 "register_operand" "=Yw") + (vec_select:V16_256 + (vec_concat:<ssedoublevecmode> + (match_operand:V16_256 1 "register_operand" "Yw") + (match_operand:V16_256 2 "nonimmediate_operand" "Ywm")) (parallel [(const_int 0) (const_int 16) (const_int 1) (const_int 17) (const_int 2) (const_int 18) @@ -15475,12 +16138,12 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) -(define_insn "vec_interleave_lowv8hi<mask_name>" - [(set (match_operand:V8HI 0 "register_operand" "=x,Yw") - (vec_select:V8HI - (vec_concat:V16HI - (match_operand:V8HI 1 "register_operand" "0,Yw") - (match_operand:V8HI 2 "vector_operand" "xBm,Ywm")) +(define_insn "vec_interleave_low<mode><mask_name>" + [(set (match_operand:V8_128 0 "register_operand" "=x,Yw") + (vec_select:V8_128 + (vec_concat:<ssedoublevecmode> + (match_operand:V8_128 1 "register_operand" "0,Yw") + (match_operand:V8_128 2 "vector_operand" "xBm,Ywm")) (parallel [(const_int 0) (const_int 8) (const_int 1) (const_int 9) (const_int 2) (const_int 10) @@ -15655,6 +16318,7 @@ (V4SI "avx512dq") (V2DI "avx512dq")]) ;; sse4_1_pinsrd must come before sse2_loadld since it is preferred. +;; For V8HFmode and TARGET_AVX2, broadcastw + pblendw should be better. 
(define_insn "<sse2p4_1>_pinsr<ssemodesuffix>" [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v") (vec_merge:PINSR_MODE @@ -15664,7 +16328,8 @@ (match_operand:SI 3 "const_int_operand")))] "TARGET_SSE2 && ((unsigned) exact_log2 (INTVAL (operands[3])) - < GET_MODE_NUNITS (<MODE>mode))" + < GET_MODE_NUNITS (<MODE>mode)) + && !(<MODE>mode == V8HFmode && TARGET_AVX2)" { operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); @@ -15672,26 +16337,18 @@ { case 0: if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode)) - return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}"; + return "pinsr<sseintmodesuffix>\t{%3, %k2, %0|%0, %k2, %3}"; /* FALLTHRU */ case 1: - return "pinsr<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}"; + return "pinsr<sseintmodesuffix>\t{%3, %2, %0|%0, %2, %3}"; case 2: case 4: if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode)) - { - if (<MODE>mode == V8HFmode) - return "vpinsrw\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; - else - return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; - } + return "vpinsr<sseintmodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; /* FALLTHRU */ case 3: case 5: - if (<MODE>mode == V8HFmode) - return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}"; - else - return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + return "vpinsr<sseintmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; default: gcc_unreachable (); } @@ -19179,11 +19836,14 @@ (lt:VI1_AVX2 (match_dup 3) (match_dup 4))] UNSPEC_BLENDV))] "operands[3] = gen_lowpart (<MODE>mode, operands[3]);") -(define_insn "sse4_1_pblendw" - [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x") - (vec_merge:V8HI - (match_operand:V8HI 2 "vector_operand" "YrBm,*xBm,xm") - (match_operand:V8HI 1 "register_operand" "0,0,x") +(define_mode_attr blendsuf + [(V8HI "w") (V8HF "ph")]) + +(define_insn "sse4_1_pblend<blendsuf>" + [(set (match_operand:V8_128 0 "register_operand" "=Yr,*x,x") + (vec_merge:V8_128 + (match_operand:V8_128 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:V8_128 1 "register_operand" "0,0,x") (match_operand:SI 3 "const_0_to_255_operand" "n,n,n")))] "TARGET_SSE4_1" "@ @@ -19210,6 +19870,47 @@ operands[3] = GEN_INT (val << 8 | val); }) +(define_expand "avx2_pblendph" + [(set (match_operand:V16HF 0 "register_operand") + (vec_merge:V16HF + (match_operand:V16HF 2 "register_operand") + (match_operand:V16HF 1 "register_operand") + (match_operand:SI 3 "const_int_operand")))] + "TARGET_AVX2 + && !((INTVAL (operands[3]) & 0xff) && (INTVAL (operands[3]) & 0xff00))" +{ + int mask = INTVAL (operands[3]); + if (mask == 0) + emit_move_insn (operands[0], operands[1]); + else + { + rtx tmp = gen_reg_rtx (V16HImode); + rtx blendw_idx, blendd_idx; + + if (mask & 0xff) + { + blendw_idx = GEN_INT (mask & 0xff); + blendd_idx = GEN_INT (15); + } + else + { + blendw_idx = GEN_INT (mask >> 8 & 0xff); + blendd_idx = GEN_INT (240); + } + operands[1] = lowpart_subreg (V16HImode, operands[1], V16HFmode); + operands[2] = lowpart_subreg (V16HImode, operands[2], V16HFmode); + emit_insn (gen_avx2_pblendw (tmp, operands[1], operands[2], blendw_idx)); + + operands[0] = lowpart_subreg (V8SImode, operands[0], V16HFmode); + tmp = lowpart_subreg (V8SImode, tmp, V16HImode); + operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode); + emit_insn (gen_avx2_pblenddv8si (operands[0], operands[1], + tmp, blendd_idx)); + } + + DONE; +}) + (define_insn "*avx2_pblendw" [(set (match_operand:V16HI 0 "register_operand" "=x") (vec_merge:V16HI @@ -24714,10 +25415,10 @@ (define_insn 
"avx512dq_fpclass<mode><mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> - [(match_operand:VF_AVX512VL 1 "vector_operand" "vm") + [(match_operand:VFH_AVX512VL 1 "vector_operand" "vm") (match_operand 2 "const_0_to_255_operand" "n")] UNSPEC_FPCLASS))] - "TARGET_AVX512DQ" + "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)" "vfpclass<ssemodesuffix><vecmemsuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; [(set_attr "type" "sse") (set_attr "length_immediate" "1") @@ -24728,11 +25429,11 @@ [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> (unspec:<avx512fmaskmode> - [(match_operand:VF_128 1 "nonimmediate_operand" "vm") + [(match_operand:VFH_128 1 "nonimmediate_operand" "vm") (match_operand 2 "const_0_to_255_operand" "n")] UNSPEC_FPCLASS) (const_int 1)))] - "TARGET_AVX512DQ" + "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)" "vfpclass<ssescalarmodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; [(set_attr "type" "sse") (set_attr "length_immediate" "1") @@ -24740,9 +25441,9 @@ (set_attr "mode" "<MODE>")]) (define_insn "<avx512>_getmant<mode><mask_name><round_saeonly_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>") + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL + [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_15_operand")] UNSPEC_GETMANT))] "TARGET_AVX512F" @@ -24751,11 +25452,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") (match_operand:SI 3 "const_0_to_15_operand")] UNSPEC_GETMANT) (match_dup 1) diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 717561a..157d49f 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -153,6 +153,7 @@ (define_subst_attr "round_mask_op4" "round" "" "<round_mask_operand4>") (define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>") (define_subst_attr "round_constraint" "round" "vm" "v") +(define_subst_attr "round_qq2phsuff" "round" "<qq2phsuff>" "") (define_subst_attr "bcst_round_constraint" "round" "vmBr" "v") (define_subst_attr "round_constraint2" "round" "m" "v") (define_subst_attr "round_constraint3" "round" "rm" "r") diff --git a/gcc/config/i386/vxworks.h b/gcc/config/i386/vxworks.h index ebda7d9..0676cb4 100644 --- a/gcc/config/i386/vxworks.h +++ b/gcc/config/i386/vxworks.h @@ -73,37 +73,37 @@ along with GCC; see the file COPYING3. 
If not see VXWORKS_OS_CPP_BUILTINS (); \ if (TARGET_64BIT) \ VX_CPUDEF (X86_64); \ - else if (TARGET_PENTIUM4) \ + else if (TARGET_CPU_P (PENTIUM4)) \ { \ VX_CPUDEF (PENTIUM4); \ VX_CPUVDEF (PENTIUM4); \ } \ - else if (TARGET_CORE2) \ + else if (TARGET_CPU_P (CORE2)) \ VX_CPUDEF (CORE2); \ - else if (TARGET_NEHALEM) \ + else if (TARGET_CPU_P (NEHALEM)) \ VX_CPUDEF (NEHALEM); \ - else if (TARGET_SANDYBRIDGE) \ + else if (TARGET_CPU_P (SANDYBRIDGE)) \ VX_CPUDEF (SANDYBRIDGE); \ - else if (TARGET_HASWELL) \ + else if (TARGET_CPU_P (HASWELL)) \ VX_CPUDEF (HASWELL); \ - else if (TARGET_SILVERMONT) \ + else if (TARGET_CPU_P (SILVERMONT)) \ VX_CPUDEF (SILVERMONT); \ - else if (TARGET_SKYLAKE || TARGET_SKYLAKE_AVX512) \ + else if (TARGET_CPU_P (SKYLAKE) || TARGET_CPU_P (SKYLAKE_AVX512)) \ VX_CPUDEF (SKYLAKE); \ - else if (TARGET_GOLDMONT) \ + else if (TARGET_CPU_P (GOLDMONT)) \ VX_CPUDEF (GOLDMONT); \ else if (TARGET_VXWORKS7) \ VX_CPUDEF (PENTIUM4); \ - else if (TARGET_386) \ + else if (TARGET_CPU_P (I386)) \ VX_CPUDEF (I80386); \ - else if (TARGET_486) \ + else if (TARGET_CPU_P (I486)) \ VX_CPUDEF (I80486); \ - else if (TARGET_PENTIUM) \ + else if (TARGET_CPU_P (PENTIUM)) \ { \ VX_CPUDEF (PENTIUM); \ VX_CPUVDEF (PENTIUM); \ } \ - else if (TARGET_PENTIUMPRO) \ + else if (TARGET_CPU_P (PENTIUMPRO)) \ { \ VX_CPUDEF (PENTIUM2); \ VX_CPUVDEF (PENTIUMPRO); \ diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index ffe810f..93644be 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = { "16", /* Func alignment. */ }; +static stringop_algs tremont_memcpy[2] = { + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; +static stringop_algs tremont_memset[2] = { + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; +static const +struct processor_costs tremont_cost = { + { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {6, 6, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 12}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {6, 6, 6, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* mask->integer and integer->mask moves */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. 
*/ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction */ + /* Setting cost to 2 makes our current implementation of synth_mult result in + use of unnecessary temporary registers causing regression on several + SPECfp benchmarks. */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (4)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (22), /* HI */ + COSTS_N_INSNS (30), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 17, /* CLEAR_RATIO */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers */ + {6, 6, 6, 10, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 10, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ + {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ + 6, /* cost of moving SSE register to integer. */ + 18, 6, /* Gather load static, per_elt. */ + 18, 6, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + /* Benchmarks shows large regressions on K8 sixtrack benchmark when this + value is increased to perhaps more appropriate value of 5. */ + 3, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (5), /* cost of FMUL instruction. */ + COSTS_N_INSNS (17), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (4), /* cost of MULSS instruction. */ + COSTS_N_INSNS (5), /* cost of MULSD instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ + 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + tremont_memcpy, + tremont_memset, + COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. 
*/ +}; + static stringop_algs intel_memcpy[2] = { {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c index 2e5ee4e..56ada99 100644 --- a/gcc/config/i386/x86-tune-sched.c +++ b/gcc/config/i386/x86-tune-sched.c @@ -71,6 +71,7 @@ ix86_issue_rate (void) case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: + case PROCESSOR_TREMONT: case PROCESSOR_GENERIC: return 4; @@ -429,6 +430,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: + case PROCESSOR_TREMONT: case PROCESSOR_GENERIC: /* Stack engine allows to execute push&pop instructions in parall. */ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 2f221b1..58e8ead 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -62,6 +62,21 @@ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", that can be partly masked by careful scheduling of moves. */ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 + | m_BDVER | m_ZNVER | m_TREMONT | m_GENERIC) + +/* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids + partial write to the destination in scalar SSE conversion from FP + to FP. */ +DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY, + "sse_partial_reg_fp_converts_dependency", + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 + | m_BDVER | m_ZNVER | m_GENERIC) + +/* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial + write to the destination in scalar SSE conversion from integer to FP. */ +DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY, + "sse_partial_reg_converts_dependency", + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 | m_BDVER | m_ZNVER | m_GENERIC) /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies @@ -136,7 +151,7 @@ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL - | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ATHLON_K8) + | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8) /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are considered on critical path. */ @@ -150,14 +165,15 @@ DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move", /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", - m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) + m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_TREMONT + | m_GENERIC) /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions. Some chips, like 486 and Pentium works faster with separate load and push instructions. */ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE - | m_GENERIC) + | m_TREMONT | m_GENERIC) /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred over esp subtraction. */ @@ -198,8 +214,7 @@ DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns", than 4 branch instructions in the 16 byte window. 
*/ DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM - | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL | m_ATHLON_K8 - | m_AMDFAM10) + | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10) /*****************************************************************************/ /* Integer instruction selection tuning */ @@ -240,11 +255,11 @@ DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves", /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL - | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL) + | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL) /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */ DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr", - m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT + m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL | m_KNM) /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is @@ -263,7 +278,7 @@ DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8", a conditional move. */ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove", m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL - | m_KNM | m_TREMONT | m_INTEL) + | m_KNM | m_INTEL) /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */ @@ -273,7 +288,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) move/set sequences of bytes with known size. */ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, "prefer_known_rep_movsb_stosb", - m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512) + m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512) /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of compact prologues and epilogues by issuing a misaligned moves. This @@ -282,7 +297,8 @@ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, FIXME: This may actualy be a win on more targets than listed here. */ DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES, "misaligned_move_string_pro_epilogues", - m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC) + m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_TREMONT + | m_GENERIC) /* X86_TUNE_USE_SAHF: Controls use of SAHF. */ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", @@ -294,7 +310,7 @@ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */ DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL - | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT)) + | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS)) /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", @@ -305,7 +321,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency for bit-manipulation instructions. */ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", - m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_GENERIC) /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based on hardware capabilities. 
Bdver3 hardware has a loop buffer which makes @@ -321,14 +337,14 @@ DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn", /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence. */ DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence", - m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC) + m_CORE_ALL | m_BDVER | m_ZNVER | m_TREMONT | m_GENERIC) /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) - (signed) x >> (W-1)) instead of cmove or SSE max/abs instructions. */ DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs", m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT ) + | m_GOLDMONT_PLUS) /*****************************************************************************/ /* 387 instruction selection tuning */ @@ -386,13 +402,13 @@ DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optim /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. */ DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", - m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC) + m_AMD_MULTIPLE | m_CORE_ALL | m_TREMONT | m_GENERIC) /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to xorps/xorpd and other variants. */ DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER - | m_GENERIC) + | m_TREMONT | m_GENERIC) /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer to SSE registers. If disabled, the moves will be done by storing @@ -419,7 +435,7 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions", fp converts to destination register. */ DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts", m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS - | m_TREMONT | m_INTEL) + | m_INTEL) /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion from FP to FP. This form of instructions avoids partial write to the @@ -434,7 +450,7 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) /* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. */ DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL) + | m_GOLDMONT_PLUS | m_INTEL) /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", diff --git a/gcc/config/mips/netbsd.h b/gcc/config/mips/netbsd.h index 85c2779..1c6a59d 100644 --- a/gcc/config/mips/netbsd.h +++ b/gcc/config/mips/netbsd.h @@ -87,7 +87,7 @@ along with GCC; see the file COPYING3. If not see else if (mips_isa >= MIPS_ISA_MIPS32 \ && mips_isa < MIPS_ISA_MIPS64) \ builtin_define ("__mips=32"); \ - else if (mips_isa >= MIPS_ISA_64) \ + else if (mips_isa >= MIPS_ISA_MIPS64) \ builtin_define ("__mips=64"); \ if (mips_isa_rev > 0) \ builtin_define_with_int_value ("__mips_isa_rev", \ diff --git a/gcc/config/rs6000/lynx.h b/gcc/config/rs6000/lynx.h index 3434c8b..0ddb54f 100644 --- a/gcc/config/rs6000/lynx.h +++ b/gcc/config/rs6000/lynx.h @@ -80,7 +80,6 @@ #undef SIZE_TYPE #undef ASM_OUTPUT_ALIGN -#undef PREFERRED_DEBUGGING_TYPE /* The file rs6000.c defines TARGET_HAVE_TLS unconditionally to the value of HAVE_AS_TLS. 
HAVE_AS_TLS is true as gas support for TLS diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index 1f6fc03..1990a21 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -91,7 +91,10 @@ UNSPEC_MMA_XVI8GER4SPP UNSPEC_MMA_XXMFACC UNSPEC_MMA_XXMTACC - UNSPEC_MMA_XXSETACCZ + ]) + +(define_c_enum "unspecv" + [UNSPECV_MMA_XXSETACCZ ]) ;; MMA instructions with 1 accumulator argument @@ -467,30 +470,16 @@ "<acc> %A0" [(set_attr "type" "mma")]) -;; We can't have integer constants in XOmode so we wrap this in an UNSPEC. - -(define_expand "mma_xxsetaccz" - [(set (match_operand:XO 0 "fpr_reg_operand") - (const_int 0))] - "TARGET_MMA" -{ - rtx xo0 = gen_rtx_UNSPEC (XOmode, gen_rtvec (1, const0_rtx), - UNSPEC_MMA_XXSETACCZ); - emit_insn (gen_rtx_SET (operands[0], xo0)); - DONE; -}) +;; We can't have integer constants in XOmode so we wrap this in an +;; UNSPEC_VOLATILE. -(define_insn_and_split "*mma_xxsetaccz" +(define_insn "mma_xxsetaccz" [(set (match_operand:XO 0 "fpr_reg_operand" "=d") - (unspec:XO [(match_operand 1 "const_0_to_1_operand" "O")] - UNSPEC_MMA_XXSETACCZ))] + (unspec_volatile:XO [(const_int 0)] + UNSPECV_MMA_XXSETACCZ))] "TARGET_MMA" "xxsetaccz %A0" - "&& reload_completed" - [(set (match_dup 0) (unspec:XO [(match_dup 1)] UNSPEC_MMA_XXSETACCZ))] - "" - [(set_attr "type" "mma") - (set_attr "length" "4")]) + [(set_attr "type" "mma")]) (define_insn "mma_<vv>" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index 6a28d51..a8c6b9e 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -208,6 +208,12 @@ double __builtin_mffs (); MFFS rs6000_mffs {} +; Although the mffsl instruction is only available on POWER9 and later +; processors, this builtin automatically falls back to mffs on older +; platforms. Thus it appears here in the [always] stanza. + double __builtin_mffsl (); + MFFSL rs6000_mffsl {} + ; This thing really assumes long double == __ibm128, and I'm told it has ; been used as such within libgcc. Given that __builtin_pack_ibm128 ; exists for the same purpose, this should really not be used at all. @@ -2784,9 +2790,6 @@ signed long long __builtin_darn_raw (); DARN_RAW darn_raw {} - double __builtin_mffsl (); - MFFSL rs6000_mffsl {} - const signed int __builtin_dtstsfi_eq_dd (const int<6>, _Decimal64); TSTSFI_EQ_DD dfptstsfi_eq_dd {} diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index afcb5bb..d08bdfe 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -35,6 +35,9 @@ #include "langhooks.h" #include "c/c-tree.h" +#include "rs6000-builtins.h" + +static tree altivec_resolve_new_overloaded_builtin (location_t, tree, void *); /* Handle the machine specific pragma longcall. Its syntax is @@ -811,6 +814,32 @@ is_float128_p (tree t) && t == long_double_type_node)); } + +/* Return true iff ARGTYPE can be compatibly passed as PARMTYPE. 
*/ +static bool +rs6000_new_builtin_type_compatible (tree parmtype, tree argtype) +{ + if (parmtype == error_mark_node) + return false; + + if (INTEGRAL_TYPE_P (parmtype) && INTEGRAL_TYPE_P (argtype)) + return true; + + if (TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128 + && is_float128_p (parmtype) && is_float128_p (argtype)) + return true; + + if (POINTER_TYPE_P (parmtype) && POINTER_TYPE_P (argtype)) + { + parmtype = TREE_TYPE (parmtype); + argtype = TREE_TYPE (argtype); + if (TYPE_READONLY (argtype)) + parmtype = build_qualified_type (parmtype, TYPE_QUAL_CONST); + } + + return lang_hooks.types_compatible_p (parmtype, argtype); +} + static inline bool rs6000_builtin_type_compatible (tree t, int id) { @@ -927,6 +956,10 @@ tree altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, void *passed_arglist) { + if (new_builtins_are_live) + return altivec_resolve_new_overloaded_builtin (loc, fndecl, + passed_arglist); + vec<tree, va_gc> *arglist = static_cast<vec<tree, va_gc> *> (passed_arglist); unsigned int nargs = vec_safe_length (arglist); enum rs6000_builtins fcode @@ -1930,3 +1963,1048 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return error_mark_node; } } + +/* Build a tree for a function call to an Altivec non-overloaded builtin. + The overloaded builtin that matched the types and args is described + by DESC. The N arguments are given in ARGS, respectively. + + Actually the only thing it does is calling fold_convert on ARGS, with + a small exception for vec_{all,any}_{ge,le} predicates. */ + +static tree +altivec_build_new_resolved_builtin (tree *args, int n, tree fntype, + tree ret_type, + rs6000_gen_builtins bif_id, + rs6000_gen_builtins ovld_id) +{ + tree argtypes = TYPE_ARG_TYPES (fntype); + tree arg_type[MAX_OVLD_ARGS]; + tree fndecl = rs6000_builtin_decls_x[bif_id]; + + for (int i = 0; i < n; i++) + { + arg_type[i] = TREE_VALUE (argtypes); + argtypes = TREE_CHAIN (argtypes); + } + + /* The AltiVec overloading implementation is overall gross, but this + is particularly disgusting. The vec_{all,any}_{ge,le} builtins + are completely different for floating-point vs. integer vector + types, because the former has vcmpgefp, but the latter should use + vcmpgtXX. + + In practice, the second and third arguments are swapped, and the + condition (LT vs. EQ, which is recognizable by bit 1 of the first + argument) is reversed. Patch the arguments here before building + the resolved CALL_EXPR. */ + if (n == 3 + && ovld_id == RS6000_OVLD_VEC_CMPGE_P + && bif_id != RS6000_BIF_VCMPGEFP_P + && bif_id != RS6000_BIF_XVCMPGEDP_P) + { + std::swap (args[1], args[2]); + std::swap (arg_type[1], arg_type[2]); + + args[0] = fold_build2 (BIT_XOR_EXPR, TREE_TYPE (args[0]), args[0], + build_int_cst (NULL_TREE, 2)); + } + + for (int j = 0; j < n; j++) + args[j] = fully_fold_convert (arg_type[j], args[j]); + + /* If the number of arguments to an overloaded function increases, + we must expand this switch. 
*/ + gcc_assert (MAX_OVLD_ARGS <= 4); + + tree call; + switch (n) + { + case 0: + call = build_call_expr (fndecl, 0); + break; + case 1: + call = build_call_expr (fndecl, 1, args[0]); + break; + case 2: + call = build_call_expr (fndecl, 2, args[0], args[1]); + break; + case 3: + call = build_call_expr (fndecl, 3, args[0], args[1], args[2]); + break; + case 4: + call = build_call_expr (fndecl, 4, args[0], args[1], args[2], args[3]); + break; + default: + gcc_unreachable (); + } + return fold_convert (ret_type, call); +} + +/* Implementation of the resolve_overloaded_builtin target hook, to + support Altivec's overloaded builtins. FIXME: This code needs + to be brutally factored. */ + +static tree +altivec_resolve_new_overloaded_builtin (location_t loc, tree fndecl, + void *passed_arglist) +{ + vec<tree, va_gc> *arglist = static_cast<vec<tree, va_gc> *> (passed_arglist); + unsigned int nargs = vec_safe_length (arglist); + enum rs6000_gen_builtins fcode + = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); + tree fnargs = TYPE_ARG_TYPES (TREE_TYPE (fndecl)); + tree types[MAX_OVLD_ARGS]; + tree args[MAX_OVLD_ARGS]; + + /* Return immediately if this isn't an overload. */ + if (fcode <= RS6000_OVLD_NONE) + return NULL_TREE; + + unsigned int adj_fcode = fcode - RS6000_OVLD_NONE; + + if (TARGET_DEBUG_BUILTIN) + fprintf (stderr, "altivec_resolve_overloaded_builtin, code = %4d, %s\n", + (int) fcode, IDENTIFIER_POINTER (DECL_NAME (fndecl))); + + /* vec_lvsl and vec_lvsr are deprecated for use with LE element order. */ + if (fcode == RS6000_OVLD_VEC_LVSL && !BYTES_BIG_ENDIAN) + warning (OPT_Wdeprecated, + "%<vec_lvsl%> is deprecated for little endian; use " + "assignment for unaligned loads and stores"); + else if (fcode == RS6000_OVLD_VEC_LVSR && !BYTES_BIG_ENDIAN) + warning (OPT_Wdeprecated, + "%<vec_lvsr%> is deprecated for little endian; use " + "assignment for unaligned loads and stores"); + + if (fcode == RS6000_OVLD_VEC_MUL) + { + /* vec_mul needs to be special cased because there are no instructions + for it for the {un}signed char, {un}signed short, and {un}signed int + types. */ + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", "vec_mul"); + return error_mark_node; + } + + tree arg0 = (*arglist)[0]; + tree arg0_type = TREE_TYPE (arg0); + tree arg1 = (*arglist)[1]; + tree arg1_type = TREE_TYPE (arg1); + + /* Both arguments must be vectors and the types must be compatible. */ + if (TREE_CODE (arg0_type) != VECTOR_TYPE) + goto bad; + if (!lang_hooks.types_compatible_p (arg0_type, arg1_type)) + goto bad; + + switch (TYPE_MODE (TREE_TYPE (arg0_type))) + { + case E_QImode: + case E_HImode: + case E_SImode: + case E_DImode: + case E_TImode: + { + /* For scalar types just use a multiply expression. */ + return fold_build2_loc (loc, MULT_EXPR, TREE_TYPE (arg0), arg0, + fold_convert (TREE_TYPE (arg0), arg1)); + } + case E_SFmode: + { + /* For floats use the xvmulsp instruction directly. */ + tree call = rs6000_builtin_decls_x[RS6000_BIF_XVMULSP]; + return build_call_expr (call, 2, arg0, arg1); + } + case E_DFmode: + { + /* For doubles use the xvmuldp instruction directly. */ + tree call = rs6000_builtin_decls_x[RS6000_BIF_XVMULDP]; + return build_call_expr (call, 2, arg0, arg1); + } + /* Other types are errors. */ + default: + goto bad; + } + } + + if (fcode == RS6000_OVLD_VEC_CMPNE) + { + /* vec_cmpne needs to be special cased because there are no instructions + for it (prior to power 9). 
*/ + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", "vec_cmpne"); + return error_mark_node; + } + + tree arg0 = (*arglist)[0]; + tree arg0_type = TREE_TYPE (arg0); + tree arg1 = (*arglist)[1]; + tree arg1_type = TREE_TYPE (arg1); + + /* Both arguments must be vectors and the types must be compatible. */ + if (TREE_CODE (arg0_type) != VECTOR_TYPE) + goto bad; + if (!lang_hooks.types_compatible_p (arg0_type, arg1_type)) + goto bad; + + /* Power9 instructions provide the most efficient implementation of + ALTIVEC_BUILTIN_VEC_CMPNE if the mode is not DImode or TImode + or SFmode or DFmode. */ + if (!TARGET_P9_VECTOR + || (TYPE_MODE (TREE_TYPE (arg0_type)) == DImode) + || (TYPE_MODE (TREE_TYPE (arg0_type)) == TImode) + || (TYPE_MODE (TREE_TYPE (arg0_type)) == SFmode) + || (TYPE_MODE (TREE_TYPE (arg0_type)) == DFmode)) + { + switch (TYPE_MODE (TREE_TYPE (arg0_type))) + { + /* vec_cmpneq (va, vb) == vec_nor (vec_cmpeq (va, vb), + vec_cmpeq (va, vb)). */ + /* Note: vec_nand also works but opt changes vec_nand's + to vec_nor's anyway. */ + case E_QImode: + case E_HImode: + case E_SImode: + case E_DImode: + case E_TImode: + case E_SFmode: + case E_DFmode: + { + /* call = vec_cmpeq (va, vb) + result = vec_nor (call, call). */ + vec<tree, va_gc> *params = make_tree_vector (); + vec_safe_push (params, arg0); + vec_safe_push (params, arg1); + tree call = altivec_resolve_new_overloaded_builtin + (loc, rs6000_builtin_decls_x[RS6000_OVLD_VEC_CMPEQ], + params); + /* Use save_expr to ensure that operands used more than once + that may have side effects (like calls) are only evaluated + once. */ + call = save_expr (call); + params = make_tree_vector (); + vec_safe_push (params, call); + vec_safe_push (params, call); + return altivec_resolve_new_overloaded_builtin + (loc, rs6000_builtin_decls_x[RS6000_OVLD_VEC_NOR], params); + } + /* Other types are errors. */ + default: + goto bad; + } + } + /* else, fall through and process the Power9 alternative below */ + } + + if (fcode == RS6000_OVLD_VEC_ADDE || fcode == RS6000_OVLD_VEC_SUBE) + { + /* vec_adde needs to be special cased because there is no instruction + for the {un}signed int version. */ + if (nargs != 3) + { + const char *name; + name = fcode == RS6000_OVLD_VEC_ADDE ? "vec_adde" : "vec_sube"; + error ("builtin %qs only accepts 3 arguments", name); + return error_mark_node; + } + + tree arg0 = (*arglist)[0]; + tree arg0_type = TREE_TYPE (arg0); + tree arg1 = (*arglist)[1]; + tree arg1_type = TREE_TYPE (arg1); + tree arg2 = (*arglist)[2]; + tree arg2_type = TREE_TYPE (arg2); + + /* All 3 arguments must be vectors of (signed or unsigned) (int or + __int128) and the types must be compatible. */ + if (TREE_CODE (arg0_type) != VECTOR_TYPE) + goto bad; + if (!lang_hooks.types_compatible_p (arg0_type, arg1_type) + || !lang_hooks.types_compatible_p (arg1_type, arg2_type)) + goto bad; + + switch (TYPE_MODE (TREE_TYPE (arg0_type))) + { + /* For {un}signed ints, + vec_adde (va, vb, carryv) == vec_add (vec_add (va, vb), + vec_and (carryv, 1)). + vec_sube (va, vb, carryv) == vec_sub (vec_sub (va, vb), + vec_and (carryv, 1)). 
*/ + case E_SImode: + { + tree add_sub_builtin; + + vec<tree, va_gc> *params = make_tree_vector (); + vec_safe_push (params, arg0); + vec_safe_push (params, arg1); + + if (fcode == RS6000_OVLD_VEC_ADDE) + add_sub_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADD]; + else + add_sub_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUB]; + + tree call + = altivec_resolve_new_overloaded_builtin (loc, + add_sub_builtin, + params); + tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1); + tree ones_vector = build_vector_from_val (arg0_type, const1); + tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type, + arg2, ones_vector); + params = make_tree_vector (); + vec_safe_push (params, call); + vec_safe_push (params, and_expr); + return altivec_resolve_new_overloaded_builtin (loc, + add_sub_builtin, + params); + } + /* For {un}signed __int128s use the vaddeuqm/vsubeuqm instruction + directly. */ + case E_TImode: + break; + + /* Types other than {un}signed int and {un}signed __int128 + are errors. */ + default: + goto bad; + } + } + + if (fcode == RS6000_OVLD_VEC_ADDEC || fcode == RS6000_OVLD_VEC_SUBEC) + { + /* vec_addec and vec_subec needs to be special cased because there is + no instruction for the {un}signed int version. */ + if (nargs != 3) + { + const char *name; + name = fcode == RS6000_OVLD_VEC_ADDEC ? "vec_addec" : "vec_subec"; + error ("builtin %qs only accepts 3 arguments", name); + return error_mark_node; + } + + tree arg0 = (*arglist)[0]; + tree arg0_type = TREE_TYPE (arg0); + tree arg1 = (*arglist)[1]; + tree arg1_type = TREE_TYPE (arg1); + tree arg2 = (*arglist)[2]; + tree arg2_type = TREE_TYPE (arg2); + + /* All 3 arguments must be vectors of (signed or unsigned) (int or + __int128) and the types must be compatible. */ + if (TREE_CODE (arg0_type) != VECTOR_TYPE) + goto bad; + if (!lang_hooks.types_compatible_p (arg0_type, arg1_type) + || !lang_hooks.types_compatible_p (arg1_type, arg2_type)) + goto bad; + + switch (TYPE_MODE (TREE_TYPE (arg0_type))) + { + /* For {un}signed ints, + vec_addec (va, vb, carryv) == + vec_or (vec_addc (va, vb), + vec_addc (vec_add (va, vb), + vec_and (carryv, 0x1))). */ + case E_SImode: + { + /* Use save_expr to ensure that operands used more than once + that may have side effects (like calls) are only evaluated + once. 
*/ + tree as_builtin; + tree as_c_builtin; + + arg0 = save_expr (arg0); + arg1 = save_expr (arg1); + vec<tree, va_gc> *params = make_tree_vector (); + vec_safe_push (params, arg0); + vec_safe_push (params, arg1); + + if (fcode == RS6000_OVLD_VEC_ADDEC) + as_c_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADDC]; + else + as_c_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUBC]; + + tree call1 = altivec_resolve_new_overloaded_builtin (loc, + as_c_builtin, + params); + params = make_tree_vector (); + vec_safe_push (params, arg0); + vec_safe_push (params, arg1); + + if (fcode == RS6000_OVLD_VEC_ADDEC) + as_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADD]; + else + as_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUB]; + + tree call2 = altivec_resolve_new_overloaded_builtin (loc, + as_builtin, + params); + tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1); + tree ones_vector = build_vector_from_val (arg0_type, const1); + tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type, + arg2, ones_vector); + params = make_tree_vector (); + vec_safe_push (params, call2); + vec_safe_push (params, and_expr); + call2 = altivec_resolve_new_overloaded_builtin (loc, as_c_builtin, + params); + params = make_tree_vector (); + vec_safe_push (params, call1); + vec_safe_push (params, call2); + tree or_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_OR]; + return altivec_resolve_new_overloaded_builtin (loc, or_builtin, + params); + } + /* For {un}signed __int128s use the vaddecuq/vsubbecuq + instructions. This occurs through normal processing. */ + case E_TImode: + break; + + /* Types other than {un}signed int and {un}signed __int128 + are errors. */ + default: + goto bad; + } + } + + /* For now treat vec_splats and vec_promote as the same. */ + if (fcode == RS6000_OVLD_VEC_SPLATS || fcode == RS6000_OVLD_VEC_PROMOTE) + { + tree type, arg; + int size; + int i; + bool unsigned_p; + vec<constructor_elt, va_gc> *vec; + const char *name; + name = fcode == RS6000_OVLD_VEC_SPLATS ? "vec_splats" : "vec_promote"; + + if (fcode == RS6000_OVLD_VEC_SPLATS && nargs != 1) + { + error ("builtin %qs only accepts 1 argument", name); + return error_mark_node; + } + if (fcode == RS6000_OVLD_VEC_PROMOTE && nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", name); + return error_mark_node; + } + /* Ignore promote's element argument. */ + if (fcode == RS6000_OVLD_VEC_PROMOTE + && !INTEGRAL_TYPE_P (TREE_TYPE ((*arglist)[1]))) + goto bad; + + arg = (*arglist)[0]; + type = TREE_TYPE (arg); + if (!SCALAR_FLOAT_TYPE_P (type) + && !INTEGRAL_TYPE_P (type)) + goto bad; + unsigned_p = TYPE_UNSIGNED (type); + switch (TYPE_MODE (type)) + { + case E_TImode: + type = unsigned_p ? unsigned_V1TI_type_node : V1TI_type_node; + size = 1; + break; + case E_DImode: + type = unsigned_p ? unsigned_V2DI_type_node : V2DI_type_node; + size = 2; + break; + case E_SImode: + type = unsigned_p ? unsigned_V4SI_type_node : V4SI_type_node; + size = 4; + break; + case E_HImode: + type = unsigned_p ? unsigned_V8HI_type_node : V8HI_type_node; + size = 8; + break; + case E_QImode: + type = unsigned_p ? 
unsigned_V16QI_type_node : V16QI_type_node; + size = 16; + break; + case E_SFmode: + type = V4SF_type_node; + size = 4; + break; + case E_DFmode: + type = V2DF_type_node; + size = 2; + break; + default: + goto bad; + } + arg = save_expr (fold_convert (TREE_TYPE (type), arg)); + vec_alloc (vec, size); + for (i = 0; i < size; i++) + { + constructor_elt elt = {NULL_TREE, arg}; + vec->quick_push (elt); + } + return build_constructor (type, vec); + } + + /* For now use pointer tricks to do the extraction, unless we are on VSX + extracting a double from a constant offset. */ + if (fcode == RS6000_OVLD_VEC_EXTRACT) + { + tree arg1; + tree arg1_type; + tree arg2; + tree arg1_inner_type; + tree decl, stmt; + tree innerptrtype; + machine_mode mode; + + /* No second argument. */ + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", "vec_extract"); + return error_mark_node; + } + + arg2 = (*arglist)[1]; + arg1 = (*arglist)[0]; + arg1_type = TREE_TYPE (arg1); + + if (TREE_CODE (arg1_type) != VECTOR_TYPE) + goto bad; + if (!INTEGRAL_TYPE_P (TREE_TYPE (arg2))) + goto bad; + + /* See if we can optimize vec_extracts with the current VSX instruction + set. */ + mode = TYPE_MODE (arg1_type); + if (VECTOR_MEM_VSX_P (mode)) + + { + tree call = NULL_TREE; + int nunits = GET_MODE_NUNITS (mode); + + arg2 = fold_for_warn (arg2); + + /* If the second argument is an integer constant, generate + the built-in code if we can. We need 64-bit and direct + move to extract the small integer vectors. */ + if (TREE_CODE (arg2) == INTEGER_CST) + { + wide_int selector = wi::to_wide (arg2); + selector = wi::umod_trunc (selector, nunits); + arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector); + switch (mode) + { + default: + break; + + case E_V1TImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V1TI]; + break; + + case E_V2DFmode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DF]; + break; + + case E_V2DImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DI]; + break; + + case E_V4SFmode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SF]; + break; + + case E_V4SImode: + if (TARGET_DIRECT_MOVE_64BIT) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SI]; + break; + + case E_V8HImode: + if (TARGET_DIRECT_MOVE_64BIT) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V8HI]; + break; + + case E_V16QImode: + if (TARGET_DIRECT_MOVE_64BIT) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V16QI]; + break; + } + } + + /* If the second argument is variable, we can optimize it if we are + generating 64-bit code on a machine with direct move. */ + else if (TREE_CODE (arg2) != INTEGER_CST && TARGET_DIRECT_MOVE_64BIT) + { + switch (mode) + { + default: + break; + + case E_V2DFmode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DF]; + break; + + case E_V2DImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DI]; + break; + + case E_V4SFmode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SF]; + break; + + case E_V4SImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SI]; + break; + + case E_V8HImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V8HI]; + break; + + case E_V16QImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V16QI]; + break; + } + } + + if (call) + { + tree result = build_call_expr (call, 2, arg1, arg2); + /* Coerce the result to vector element type. May be no-op. 
*/ + arg1_inner_type = TREE_TYPE (arg1_type); + result = fold_convert (arg1_inner_type, result); + return result; + } + } + + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2). */ + arg1_inner_type = TREE_TYPE (arg1_type); + tree subp = build_int_cst (TREE_TYPE (arg2), + TYPE_VECTOR_SUBPARTS (arg1_type) - 1); + arg2 = build_binary_op (loc, BIT_AND_EXPR, arg2, subp, 0); + decl = build_decl (loc, VAR_DECL, NULL_TREE, arg1_type); + DECL_EXTERNAL (decl) = 0; + TREE_PUBLIC (decl) = 0; + DECL_CONTEXT (decl) = current_function_decl; + TREE_USED (decl) = 1; + TREE_TYPE (decl) = arg1_type; + TREE_READONLY (decl) = TYPE_READONLY (arg1_type); + if (c_dialect_cxx ()) + { + stmt = build4 (TARGET_EXPR, arg1_type, decl, arg1, + NULL_TREE, NULL_TREE); + SET_EXPR_LOCATION (stmt, loc); + } + else + { + DECL_INITIAL (decl) = arg1; + stmt = build1 (DECL_EXPR, arg1_type, decl); + TREE_ADDRESSABLE (decl) = 1; + SET_EXPR_LOCATION (stmt, loc); + stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); + } + + innerptrtype = build_pointer_type (arg1_inner_type); + + stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); + stmt = convert (innerptrtype, stmt); + stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); + stmt = build_indirect_ref (loc, stmt, RO_NULL); + + /* PR83660: We mark this as having side effects so that + downstream in fold_build_cleanup_point_expr () it will get a + CLEANUP_POINT_EXPR. If it does not we can run into an ICE + later in gimplify_cleanup_point_expr (). Potentially this + causes missed optimization because there actually is no side + effect. */ + if (c_dialect_cxx ()) + TREE_SIDE_EFFECTS (stmt) = 1; + + return stmt; + } + + /* For now use pointer tricks to do the insertion, unless we are on VSX + inserting a double to a constant offset. */ + if (fcode == RS6000_OVLD_VEC_INSERT) + { + tree arg0; + tree arg1; + tree arg2; + tree arg1_type; + tree decl, stmt; + machine_mode mode; + + /* No second or third arguments. */ + if (nargs != 3) + { + error ("builtin %qs only accepts 3 arguments", "vec_insert"); + return error_mark_node; + } + + arg0 = (*arglist)[0]; + arg1 = (*arglist)[1]; + arg1_type = TREE_TYPE (arg1); + arg2 = fold_for_warn ((*arglist)[2]); + + if (TREE_CODE (arg1_type) != VECTOR_TYPE) + goto bad; + if (!INTEGRAL_TYPE_P (TREE_TYPE (arg2))) + goto bad; + + /* If we can use the VSX xxpermdi instruction, use that for insert. */ + mode = TYPE_MODE (arg1_type); + if ((mode == V2DFmode || mode == V2DImode) && VECTOR_UNIT_VSX_P (mode) + && TREE_CODE (arg2) == INTEGER_CST) + { + wide_int selector = wi::to_wide (arg2); + selector = wi::umod_trunc (selector, 2); + tree call = NULL_TREE; + + arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector); + if (mode == V2DFmode) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V2DF]; + else if (mode == V2DImode) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V2DI]; + + /* Note, __builtin_vec_insert_<xxx> has vector and scalar types + reversed. */ + if (call) + return build_call_expr (call, 3, arg1, arg0, arg2); + } + else if (mode == V1TImode && VECTOR_UNIT_VSX_P (mode) + && TREE_CODE (arg2) == INTEGER_CST) + { + tree call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V1TI]; + wide_int selector = wi::zero(32); + + arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector); + /* Note, __builtin_vec_insert_<xxx> has vector and scalar types + reversed. */ + return build_call_expr (call, 3, arg1, arg0, arg2); + } + + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with + VIEW_CONVERT_EXPR. 
i.e.: + D.3192 = v1; + _1 = n & 3; + VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i; + v1 = D.3192; + D.3194 = v1; */ + if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) + arg2 = build_int_cst (TREE_TYPE (arg2), 0); + else + arg2 = build_binary_op (loc, BIT_AND_EXPR, arg2, + build_int_cst (TREE_TYPE (arg2), + TYPE_VECTOR_SUBPARTS (arg1_type) + - 1), 0); + decl = build_decl (loc, VAR_DECL, NULL_TREE, arg1_type); + DECL_EXTERNAL (decl) = 0; + TREE_PUBLIC (decl) = 0; + DECL_CONTEXT (decl) = current_function_decl; + TREE_USED (decl) = 1; + TREE_TYPE (decl) = arg1_type; + TREE_READONLY (decl) = TYPE_READONLY (arg1_type); + TREE_ADDRESSABLE (decl) = 1; + if (c_dialect_cxx ()) + { + stmt = build4 (TARGET_EXPR, arg1_type, decl, arg1, + NULL_TREE, NULL_TREE); + SET_EXPR_LOCATION (stmt, loc); + } + else + { + DECL_INITIAL (decl) = arg1; + stmt = build1 (DECL_EXPR, arg1_type, decl); + SET_EXPR_LOCATION (stmt, loc); + stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); + } + + if (TARGET_VSX) + { + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, + convert (TREE_TYPE (stmt), arg0)); + stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + } + else + { + tree arg1_inner_type; + tree innerptrtype; + arg1_inner_type = TREE_TYPE (arg1_type); + innerptrtype = build_pointer_type (arg1_inner_type); + + stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); + stmt = convert (innerptrtype, stmt); + stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); + stmt = build_indirect_ref (loc, stmt, RO_NULL); + stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, + convert (TREE_TYPE (stmt), arg0)); + stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + } + return stmt; + } + + unsigned int n; + for (n = 0; + !VOID_TYPE_P (TREE_VALUE (fnargs)) && n < nargs; + fnargs = TREE_CHAIN (fnargs), n++) + { + tree decl_type = TREE_VALUE (fnargs); + tree arg = (*arglist)[n]; + tree type; + + if (arg == error_mark_node) + return error_mark_node; + + if (n >= MAX_OVLD_ARGS) + abort (); + + arg = default_conversion (arg); + + /* The C++ front-end converts float * to const void * using + NOP_EXPR<const void *> (NOP_EXPR<void *> (x)). */ + type = TREE_TYPE (arg); + if (POINTER_TYPE_P (type) + && TREE_CODE (arg) == NOP_EXPR + && lang_hooks.types_compatible_p (TREE_TYPE (arg), + const_ptr_type_node) + && lang_hooks.types_compatible_p (TREE_TYPE (TREE_OPERAND (arg, 0)), + ptr_type_node)) + { + arg = TREE_OPERAND (arg, 0); + type = TREE_TYPE (arg); + } + + /* Remove the const from the pointers to simplify the overload + matching further down. */ + if (POINTER_TYPE_P (decl_type) + && POINTER_TYPE_P (type) + && TYPE_QUALS (TREE_TYPE (type)) != 0) + { + if (TYPE_READONLY (TREE_TYPE (type)) + && !TYPE_READONLY (TREE_TYPE (decl_type))) + warning (0, "passing argument %d of %qE discards const qualifier " + "from pointer target type", n + 1, fndecl); + type = build_qualified_type (TREE_TYPE (type), 0); + type = build_pointer_type (type); + arg = fold_convert (type, arg); + } + + /* For RS6000_OVLD_VEC_LXVL, convert any const * to its non constant + equivalent to simplify the overload matching below. 
*/ + if (fcode == RS6000_OVLD_VEC_LXVL) + { + if (POINTER_TYPE_P (type) + && TYPE_READONLY (TREE_TYPE (type))) + { + type = build_qualified_type (TREE_TYPE (type), 0); + type = build_pointer_type (type); + arg = fold_convert (type, arg); + } + } + + args[n] = arg; + types[n] = type; + } + + /* If the number of arguments did not match the prototype, return NULL + and the generic code will issue the appropriate error message. */ + if (!VOID_TYPE_P (TREE_VALUE (fnargs)) || n < nargs) + return NULL; + + if (fcode == RS6000_OVLD_VEC_STEP) + { + if (TREE_CODE (types[0]) != VECTOR_TYPE) + goto bad; + + return build_int_cst (NULL_TREE, TYPE_VECTOR_SUBPARTS (types[0])); + } + + { + bool unsupported_builtin = false; + enum rs6000_gen_builtins overloaded_code; + bool supported = false; + ovlddata *instance = rs6000_overload_info[adj_fcode].first_instance; + gcc_assert (instance != NULL); + + /* Need to special case __builtin_cmpb because the overloaded forms + of this function take (unsigned int, unsigned int) or (unsigned + long long int, unsigned long long int). Since C conventions + allow the respective argument types to be implicitly coerced into + each other, the default handling does not provide adequate + discrimination between the desired forms of the function. */ + if (fcode == RS6000_OVLD_SCAL_CMPB) + { + machine_mode arg1_mode = TYPE_MODE (types[0]); + machine_mode arg2_mode = TYPE_MODE (types[1]); + + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", "__builtin_cmpb"); + return error_mark_node; + } + + /* If any supplied arguments are wider than 32 bits, resolve to + 64-bit variant of built-in function. */ + if (GET_MODE_PRECISION (arg1_mode) > 32 + || GET_MODE_PRECISION (arg2_mode) > 32) + /* Assure all argument and result types are compatible with + the built-in function represented by RS6000_BIF_CMPB. */ + overloaded_code = RS6000_BIF_CMPB; + else + /* Assure all argument and result types are compatible with + the built-in function represented by RS6000_BIF_CMPB_32. */ + overloaded_code = RS6000_BIF_CMPB_32; + + while (instance && instance->bifid != overloaded_code) + instance = instance->next; + + gcc_assert (instance != NULL); + tree fntype = rs6000_builtin_info_x[instance->bifid].fntype; + tree parmtype0 = TREE_VALUE (TYPE_ARG_TYPES (fntype)); + tree parmtype1 = TREE_VALUE (TREE_CHAIN (TYPE_ARG_TYPES (fntype))); + + if (rs6000_new_builtin_type_compatible (types[0], parmtype0) + && rs6000_new_builtin_type_compatible (types[1], parmtype1)) + { + if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node + && rs6000_new_builtin_is_supported (instance->bifid)) + { + tree ret_type = TREE_TYPE (instance->fntype); + return altivec_build_new_resolved_builtin (args, n, fntype, + ret_type, + instance->bifid, + fcode); + } + else + unsupported_builtin = true; + } + } + else if (fcode == RS6000_OVLD_VEC_VSIE) + { + machine_mode arg1_mode = TYPE_MODE (types[0]); + + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", + "scalar_insert_exp"); + return error_mark_node; + } + + /* If supplied first argument is wider than 64 bits, resolve to + 128-bit variant of built-in function. */ + if (GET_MODE_PRECISION (arg1_mode) > 64) + { + /* If first argument is of float variety, choose variant + that expects __ieee128 argument. Otherwise, expect + __int128 argument. 
*/ + if (GET_MODE_CLASS (arg1_mode) == MODE_FLOAT) + overloaded_code = RS6000_BIF_VSIEQPF; + else + overloaded_code = RS6000_BIF_VSIEQP; + } + else + { + /* If first argument is of float variety, choose variant + that expects double argument. Otherwise, expect + long long int argument. */ + if (GET_MODE_CLASS (arg1_mode) == MODE_FLOAT) + overloaded_code = RS6000_BIF_VSIEDPF; + else + overloaded_code = RS6000_BIF_VSIEDP; + } + + while (instance && instance->bifid != overloaded_code) + instance = instance->next; + + gcc_assert (instance != NULL); + tree fntype = rs6000_builtin_info_x[instance->bifid].fntype; + tree parmtype0 = TREE_VALUE (TYPE_ARG_TYPES (fntype)); + tree parmtype1 = TREE_VALUE (TREE_CHAIN (TYPE_ARG_TYPES (fntype))); + + if (rs6000_new_builtin_type_compatible (types[0], parmtype0) + && rs6000_new_builtin_type_compatible (types[1], parmtype1)) + { + if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node + && rs6000_new_builtin_is_supported (instance->bifid)) + { + tree ret_type = TREE_TYPE (instance->fntype); + return altivec_build_new_resolved_builtin (args, n, fntype, + ret_type, + instance->bifid, + fcode); + } + else + unsupported_builtin = true; + } + } + else + { + /* Functions with no arguments can have only one overloaded + instance. */ + gcc_assert (n > 0 || !instance->next); + + for (; instance != NULL; instance = instance->next) + { + bool mismatch = false; + tree nextparm = TYPE_ARG_TYPES (instance->fntype); + + for (unsigned int arg_i = 0; + arg_i < nargs && nextparm != NULL; + arg_i++) + { + tree parmtype = TREE_VALUE (nextparm); + if (!rs6000_new_builtin_type_compatible (types[arg_i], + parmtype)) + { + mismatch = true; + break; + } + nextparm = TREE_CHAIN (nextparm); + } + + if (mismatch) + continue; + + supported = rs6000_new_builtin_is_supported (instance->bifid); + if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node + && supported) + { + tree fntype = rs6000_builtin_info_x[instance->bifid].fntype; + tree ret_type = TREE_TYPE (instance->fntype); + return altivec_build_new_resolved_builtin (args, n, fntype, + ret_type, + instance->bifid, + fcode); + } + else + { + unsupported_builtin = true; + break; + } + } + } + + if (unsupported_builtin) + { + const char *name = rs6000_overload_info[adj_fcode].ovld_name; + if (!supported) + { + const char *internal_name + = rs6000_builtin_info_x[instance->bifid].bifname; + /* An error message making reference to the name of the + non-overloaded function has already been issued. Add + clarification of the previous message. */ + rich_location richloc (line_table, input_location); + inform (&richloc, "builtin %qs requires builtin %qs", + name, internal_name); + } + else + error ("%qs is not supported in this compiler configuration", name); + + return error_mark_node; + } + } + bad: + { + const char *name = rs6000_overload_info[adj_fcode].ovld_name; + error ("invalid parameter combination for AltiVec intrinsic %qs", name); + return error_mark_node; + } +} diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index e8625d1..a55cb7c 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -12971,6 +12971,59 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) return false; } +/* Check whether a builtin function is supported in this target + configuration. 
*/ +bool +rs6000_new_builtin_is_supported (enum rs6000_gen_builtins fncode) +{ + switch (rs6000_builtin_info_x[(size_t) fncode].enable) + { + case ENB_ALWAYS: + return true; + case ENB_P5: + return TARGET_POPCNTB; + case ENB_P6: + return TARGET_CMPB; + case ENB_P7: + return TARGET_POPCNTD; + case ENB_P7_64: + return TARGET_POPCNTD && TARGET_POWERPC64; + case ENB_P8: + return TARGET_DIRECT_MOVE; + case ENB_P8V: + return TARGET_P8_VECTOR; + case ENB_P9: + return TARGET_MODULO; + case ENB_P9_64: + return TARGET_MODULO && TARGET_POWERPC64; + case ENB_P9V: + return TARGET_P9_VECTOR; + case ENB_P10: + return TARGET_POWER10; + case ENB_P10_64: + return TARGET_POWER10 && TARGET_POWERPC64; + case ENB_ALTIVEC: + return TARGET_ALTIVEC; + case ENB_VSX: + return TARGET_VSX; + case ENB_CELL: + return TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL; + case ENB_IEEE128_HW: + return TARGET_FLOAT128_HW; + case ENB_DFP: + return TARGET_DFP; + case ENB_CRYPTO: + return TARGET_CRYPTO; + case ENB_HTM: + return TARGET_HTM; + case ENB_MMA: + return TARGET_MMA; + default: + gcc_unreachable (); + } + gcc_unreachable (); +} + /* Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient (and in mode MODE if that's convenient). diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index f3d6156..f65932e 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -2314,7 +2314,7 @@ write_decls (void) fprintf (header_file, "extern void rs6000_init_generated_builtins ();\n\n"); fprintf (header_file, - "extern bool rs6000_new_builtin_is_supported_p " + "extern bool rs6000_new_builtin_is_supported " "(rs6000_gen_builtins);\n"); fprintf (header_file, "extern tree rs6000_builtin_decl (unsigned, " diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 2570937..ad81dfb 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -21728,7 +21728,8 @@ rs6000_xcoff_encode_section_info (tree decl, rtx rtl, int first) if (decl && DECL_P (decl) && VAR_OR_FUNCTION_DECL_P (decl) - && symtab_node::get (decl)->alias == 0 + && (symtab_node::get (decl) == NULL + || symtab_node::get (decl)->alias == 0) && symname[strlen (symname) - 1] != ']') { const char *smclass = NULL; @@ -22174,7 +22175,7 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code, break; case UNSPEC: - if (XINT (x, 1) == UNSPEC_MMA_XXSETACCZ) + if (XINT (x, 1) == UNSPECV_MMA_XXSETACCZ) { *total = 0; return true; diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 3753de1..c1cb9ab 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -561,10 +561,6 @@ mpower9-minmax Target Undocumented Mask(P9_MINMAX) Var(rs6000_isa_flags) Use the new min/max instructions defined in ISA 3.0. -mtoc-fusion -Target Undocumented Mask(TOC_FUSION) Var(rs6000_isa_flags) -Fuse medium/large code model toc references with the memory instruction. - mmodulo Target Undocumented Mask(MODULO) Var(rs6000_isa_flags) Generate the integer modulo instructions. 
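The vec_cmpne path in altivec_resolve_new_overloaded_builtin above falls back, when no vcmpne* instruction is available (pre-Power9, or the DImode/TImode/SFmode/DFmode element modes listed), to vec_nor of vec_cmpeq with itself. A minimal C sketch of that equivalence, not part of this patch, assuming <altivec.h> on a VSX-enabled PowerPC target (the function name is hypothetical):

/* Illustration only: the folding quoted in the comment,
   vec_cmpne (a, b) == vec_nor (vec_cmpeq (a, b), vec_cmpeq (a, b)).
   The resolver wraps the vec_cmpeq result in save_expr so the comparison
   is evaluated only once.  */
#include <altivec.h>

vector bool int
cmpne_by_nor (vector signed int a, vector signed int b)
{
  vector bool int eq = vec_cmpeq (a, b);
  return vec_nor (eq, eq);
}

vec_adde/vec_sube and vec_addec/vec_subec are folded in the same spirit, combining vec_add/vec_sub, vec_addc/vec_subc and a vec_and of the carry vector with 1, exactly as spelled out in the comments above.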
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 92766d8..d48a4b1 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -44,15 +44,11 @@ rs6000-logue.o: $(srcdir)/config/rs6000/rs6000-logue.c $(COMPILE) $< $(POSTCOMPILE) -rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.c - $(COMPILE) $< - $(POSTCOMPILE) - -rbtree.o: $(srcdir)/config/rs6000/rbtree.c - $(COMPILE) $< - $(POSTCOMPILE) +build/rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.c +build/rbtree.o: $(srcdir)/config/rs6000/rbtree.c -rs6000-gen-builtins: rs6000-gen-builtins.o rbtree.o +build/rs6000-gen-builtins$(build_exeext): build/rs6000-gen-builtins.o \ + build/rbtree.o $(BUILD_LIBDEPS) $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ \ $(filter-out $(BUILD_LIBDEPS), $^) $(BUILD_LIBS) @@ -62,10 +58,11 @@ rs6000-gen-builtins: rs6000-gen-builtins.o rbtree.o # <recipe> # For now, the header files depend on rs6000-builtins.c, which avoids # races because the .c file is closed last in rs6000-gen-builtins.c. -rs6000-builtins.c: rs6000-gen-builtins \ +rs6000-builtins.c: build/rs6000-gen-builtins$(build_exeext) \ $(srcdir)/config/rs6000/rs6000-builtin-new.def \ $(srcdir)/config/rs6000/rs6000-overload.def - ./rs6000-gen-builtins $(srcdir)/config/rs6000/rs6000-builtin-new.def \ + $(RUN_GEN) ./build/rs6000-gen-builtins$(build_exeext) \ + $(srcdir)/config/rs6000/rs6000-builtin-new.def \ $(srcdir)/config/rs6000/rs6000-overload.def rs6000-builtins.h \ rs6000-builtins.c rs6000-vecdefines.h diff --git a/gcc/config/sparc/leon5.md b/gcc/config/sparc/leon5.md new file mode 100644 index 0000000..6a065b1 --- /dev/null +++ b/gcc/config/sparc/leon5.md @@ -0,0 +1,103 @@ +;; Scheduling description for LEON5. +;; Copyright (C) 2021 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + + +;; The LEON5 can often dual issue instructions from the same 64-bit aligned +;; double word if there are no data dependencies. +;; +;; Avoid scheduling load/store, FPU, and multiply instructions back to +;; back, regardless of data dependencies. +;; +;; Push comparisons away from the associated branch instruction. +;; +;; Avoid scheduling ALU instructions with data dependencies back to back. +;; +;; Schedule three instructions between load and dependent instruction. 
+ +(define_automaton "leon5") + +(define_cpu_unit "leon5_memory" "leon5") +(define_cpu_unit "leon5_mul" "leon5") +(define_cpu_unit "grfpu_d" "grfpu") +(define_cpu_unit "grfpu_s" "grfpu") + +(define_insn_reservation "leon5_load" 4 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "load,sload")) + "leon5_memory * 2, nothing * 2") + +(define_insn_reservation "leon5_fpload" 2 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpload")) + "leon5_memory * 2 + grfpu_alu * 2") + +(define_insn_reservation "leon5_store" 2 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "store")) + "leon5_memory * 2") + +(define_insn_reservation "leon5_fpstore" 2 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpstore")) + "leon5_memory * 2 + grfpu_alu * 2") + +(define_insn_reservation "leon5_ialu" 2 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "ialu, shift, ialuX")) + "nothing * 2") + +(define_insn_reservation "leon5_compare" 5 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "compare")) + "nothing * 5") + +(define_insn_reservation "leon5_imul" 4 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "imul")) + "leon5_mul * 2, nothing * 2") + +(define_insn_reservation "leon5_idiv" 35 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "imul")) + "nothing * 35") + +(define_insn_reservation "leon5_fp_alu" 5 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fp,fpcmp,fpmul,fpmove")) + "grfpu_alu * 2, nothing*3") + +(define_insn_reservation "leon5_fp_divs" 17 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpdivs")) + "grfpu_alu * 2 + grfpu_d*16, nothing") + +(define_insn_reservation "leon5_fp_divd" 18 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpdivd")) + "grfpu_alu * 2 + grfpu_d*17, nothing") + +(define_insn_reservation "leon5_fp_sqrts" 25 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpsqrts")) + "grfpu_alu * 2 + grfpu_s*24, nothing") + +(define_insn_reservation "leon5_fp_sqrtd" 26 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpsqrtd")) + "grfpu_alu * 2 + grfpu_s*25, nothing") diff --git a/gcc/config/sparc/sparc-opts.h b/gcc/config/sparc/sparc-opts.h index 1af556e..9299cf6 100644 --- a/gcc/config/sparc/sparc-opts.h +++ b/gcc/config/sparc/sparc-opts.h @@ -31,6 +31,7 @@ enum sparc_processor_type { PROCESSOR_HYPERSPARC, PROCESSOR_LEON, PROCESSOR_LEON3, + PROCESSOR_LEON5, PROCESSOR_LEON3V7, PROCESSOR_SPARCLITE, PROCESSOR_F930, diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index 06f41d7..6bc6f0a 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -270,6 +270,31 @@ struct processor_costs leon3_costs = { }; static const +struct processor_costs leon5_costs = { + COSTS_N_INSNS (1), /* int load */ + COSTS_N_INSNS (1), /* int signed load */ + COSTS_N_INSNS (1), /* int zeroed load */ + COSTS_N_INSNS (1), /* float load */ + COSTS_N_INSNS (1), /* fmov, fneg, fabs */ + COSTS_N_INSNS (1), /* fadd, fsub */ + COSTS_N_INSNS (1), /* fcmp */ + COSTS_N_INSNS (1), /* fmov, fmovr */ + COSTS_N_INSNS (1), /* fmul */ + COSTS_N_INSNS (17), /* fdivs */ + COSTS_N_INSNS (18), /* fdivd */ + COSTS_N_INSNS (25), /* fsqrts */ + COSTS_N_INSNS (26), /* fsqrtd */ + COSTS_N_INSNS (4), /* imul */ + COSTS_N_INSNS (4), /* imulX */ + 0, /* imul bit factor */ + COSTS_N_INSNS (35), /* idiv */ + COSTS_N_INSNS (35), /* idivX */ + COSTS_N_INSNS (1), /* movcc/movr */ + 0, /* shift penalty */ + 3 /* branch cost */ +}; + +static const struct processor_costs sparclet_costs = { COSTS_N_INSNS (3), /* int load */ COSTS_N_INSNS (3), /* int signed load */ @@ -575,6 +600,7 @@ static int function_arg_slotno (const 
CUMULATIVE_ARGS *, machine_mode, static int supersparc_adjust_cost (rtx_insn *, int, rtx_insn *, int); static int hypersparc_adjust_cost (rtx_insn *, int, rtx_insn *, int); +static int leon5_adjust_cost (rtx_insn *, int, rtx_insn *, int); static void sparc_emit_set_const32 (rtx, rtx); static void sparc_emit_set_const64 (rtx, rtx); @@ -1045,6 +1071,43 @@ atomic_insn_for_leon3_p (rtx_insn *insn) } } +/* True if INSN is a store instruction. */ + +static bool +store_insn_p (rtx_insn *insn) +{ + if (GET_CODE (PATTERN (insn)) != SET) + return false; + + switch (get_attr_type (insn)) + { + case TYPE_STORE: + case TYPE_FPSTORE: + return true; + default: + return false; + } +} + +/* True if INSN is a load instruction. */ + +static bool +load_insn_p (rtx_insn *insn) +{ + if (GET_CODE (PATTERN (insn)) != SET) + return false; + + switch (get_attr_type (insn)) + { + case TYPE_LOAD: + case TYPE_SLOAD: + case TYPE_FPLOAD: + return true; + default: + return false; + } +} + /* We use a machine specific pass to enable workarounds for errata. We need to have the (essentially) final form of the insn stream in order @@ -1057,10 +1120,29 @@ atomic_insn_for_leon3_p (rtx_insn *insn) && GET_CODE (PATTERN (INSN)) != USE \ && GET_CODE (PATTERN (INSN)) != CLOBBER) +rtx_insn * +next_active_non_empty_insn (rtx_insn *insn) +{ + insn = next_active_insn (insn); + + while (insn + && (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE + || GET_CODE (PATTERN (insn)) == ASM_INPUT + || (USEFUL_INSN_P (insn) + && (asm_noperands (PATTERN (insn)) >= 0) + && !strcmp (decode_asm_operands (PATTERN (insn), + NULL, NULL, NULL, + NULL, NULL), "")))) + insn = next_active_insn (insn); + + return insn; +} + static unsigned int sparc_do_work_around_errata (void) { rtx_insn *insn, *next; + bool find_first_useful = true; /* Force all instructions to be split into their final form. */ split_all_insns_noflow (); @@ -1085,6 +1167,16 @@ sparc_do_work_around_errata (void) else jump = NULL; + /* Do not begin function with atomic instruction. */ + if (sparc_fix_ut700 + && find_first_useful + && USEFUL_INSN_P (insn)) + { + find_first_useful = false; + if (atomic_insn_for_leon3_p (insn)) + emit_insn_before (gen_nop (), insn); + } + /* Place a NOP at the branch target of an integer branch if it is a floating-point operation or a floating-point branch. */ if (sparc_fix_gr712rc @@ -1105,9 +1197,7 @@ sparc_do_work_around_errata (void) instruction at branch target. */ if (sparc_fix_ut700 && NONJUMP_INSN_P (insn) - && (set = single_set (insn)) != NULL_RTX - && mem_ref (SET_SRC (set)) - && REG_P (SET_DEST (set))) + && load_insn_p (insn)) { if (jump && jump_to_label_p (jump)) { @@ -1116,7 +1206,7 @@ sparc_do_work_around_errata (void) emit_insn_before (gen_nop (), target); } - next = next_active_insn (insn); + next = next_active_non_empty_insn (insn); if (!next) break; @@ -1212,30 +1302,19 @@ sparc_do_work_around_errata (void) if (sparc_fix_b2bst && NONJUMP_INSN_P (insn) && (set = single_set (insn)) != NULL_RTX - && MEM_P (SET_DEST (set))) + && store_insn_p (insn)) { /* Sequence B begins with a double-word store. */ bool seq_b = GET_MODE_SIZE (GET_MODE (SET_DEST (set))) == 8; rtx_insn *after; int i; - next = next_active_insn (insn); + next = next_active_non_empty_insn (insn); if (!next) break; for (after = next, i = 0; i < 2; i++) { - /* Skip empty assembly statements. 
*/ - if ((GET_CODE (PATTERN (after)) == UNSPEC_VOLATILE) - || (USEFUL_INSN_P (after) - && (asm_noperands (PATTERN (after))>=0) - && !strcmp (decode_asm_operands (PATTERN (after), - NULL, NULL, NULL, - NULL, NULL), ""))) - after = next_active_insn (after); - if (!after) - break; - /* If the insn is a branch, then it cannot be problematic. */ if (!NONJUMP_INSN_P (after) || GET_CODE (PATTERN (after)) == SEQUENCE) @@ -1245,8 +1324,7 @@ sparc_do_work_around_errata (void) if (seq_b) { /* Add NOP if followed by a store. */ - if ((set = single_set (after)) != NULL_RTX - && MEM_P (SET_DEST (set))) + if (store_insn_p (after)) insert_nop = true; /* Otherwise it is ok. */ @@ -1261,15 +1339,14 @@ sparc_do_work_around_errata (void) && (MEM_P (SET_DEST (set)) || mem_ref (SET_SRC (set)))) break; - after = next_active_insn (after); + after = next_active_non_empty_insn (after); if (!after) break; } /* Add NOP if third instruction is a store. */ if (i == 1 - && (set = single_set (after)) != NULL_RTX - && MEM_P (SET_DEST (set))) + && store_insn_p (after)) insert_nop = true; } } @@ -1596,6 +1673,10 @@ dump_target_flag_bits (const int flags) fprintf (stderr, "CBCOND "); if (flags & MASK_DEPRECATED_V8_INSNS) fprintf (stderr, "DEPRECATED_V8_INSNS "); + if (flags & MASK_LEON) + fprintf (stderr, "LEON "); + if (flags & MASK_LEON3) + fprintf (stderr, "LEON3 "); if (flags & MASK_SPARCLET) fprintf (stderr, "SPARCLET "); if (flags & MASK_SPARCLITE) @@ -1632,6 +1713,7 @@ sparc_option_override (void) { TARGET_CPU_hypersparc, PROCESSOR_HYPERSPARC }, { TARGET_CPU_leon, PROCESSOR_LEON }, { TARGET_CPU_leon3, PROCESSOR_LEON3 }, + { TARGET_CPU_leon5, PROCESSOR_LEON5 }, { TARGET_CPU_leon3v7, PROCESSOR_LEON3V7 }, { TARGET_CPU_sparclite, PROCESSOR_F930 }, { TARGET_CPU_sparclite86x, PROCESSOR_SPARCLITE86X }, @@ -1663,6 +1745,7 @@ sparc_option_override (void) { "hypersparc", MASK_ISA, MASK_V8 }, { "leon", MASK_ISA|MASK_FSMULD, MASK_V8|MASK_LEON }, { "leon3", MASK_ISA, MASK_V8|MASK_LEON3 }, + { "leon5", MASK_ISA, MASK_V8|MASK_LEON3 }, { "leon3v7", MASK_ISA, MASK_LEON3 }, { "sparclite", MASK_ISA, MASK_SPARCLITE }, /* The Fujitsu MB86930 is the original sparclite chip, with no FPU. */ @@ -1973,6 +2056,9 @@ sparc_option_override (void) case PROCESSOR_LEON3V7: sparc_costs = &leon3_costs; break; + case PROCESSOR_LEON5: + sparc_costs = &leon5_costs; + break; case PROCESSOR_SPARCLET: case PROCESSOR_TSC701: sparc_costs = &sparclet_costs; @@ -10120,11 +10206,64 @@ hypersparc_adjust_cost (rtx_insn *insn, int dtype, rtx_insn *dep_insn, } static int +leon5_adjust_cost (rtx_insn *insn, int dtype, rtx_insn *dep_insn, + int cost) +{ + enum attr_type insn_type, dep_type; + rtx pat = PATTERN (insn); + rtx dep_pat = PATTERN (dep_insn); + + if (recog_memoized (insn) < 0 || recog_memoized (dep_insn) < 0) + return cost; + + insn_type = get_attr_type (insn); + dep_type = get_attr_type (dep_insn); + + switch (dtype) + { + case REG_DEP_TRUE: + /* Data dependency; DEP_INSN writes a register that INSN reads some + cycles later. */ + + switch (insn_type) + { + case TYPE_STORE: + /* Try to schedule three instructions between the store and + the ALU instruction that generated the data. */ + if (dep_type == TYPE_IALU || dep_type == TYPE_SHIFT) + { + if (GET_CODE (pat) != SET || GET_CODE (dep_pat) != SET) + break; + + if (rtx_equal_p (SET_DEST (dep_pat), SET_SRC (pat))) + return 4; + } + break; + default: + break; + } + break; + case REG_DEP_ANTI: + /* Penalize anti-dependencies for FPU instructions. 
*/ + if (fpop_insn_p (insn) || insn_type == TYPE_FPLOAD) + return 4; + break; + default: + break; + } + + return cost; +} + +static int sparc_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep, int cost, unsigned int) { switch (sparc_cpu) { + case PROCESSOR_LEON5: + cost = leon5_adjust_cost (insn, dep_type, dep, cost); + break; case PROCESSOR_SUPERSPARC: cost = supersparc_adjust_cost (insn, dep_type, dep, cost); break; diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h index 4da5a06..edafa99 100644 --- a/gcc/config/sparc/sparc.h +++ b/gcc/config/sparc/sparc.h @@ -120,21 +120,22 @@ along with GCC; see the file COPYING3. If not see #define TARGET_CPU_leon 4 #define TARGET_CPU_leon3 5 #define TARGET_CPU_leon3v7 6 -#define TARGET_CPU_sparclite 7 -#define TARGET_CPU_f930 7 /* alias */ -#define TARGET_CPU_f934 7 /* alias */ -#define TARGET_CPU_sparclite86x 8 -#define TARGET_CPU_sparclet 9 -#define TARGET_CPU_tsc701 9 /* alias */ -#define TARGET_CPU_v9 10 /* generic v9 implementation */ -#define TARGET_CPU_sparcv9 10 /* alias */ -#define TARGET_CPU_sparc64 10 /* alias */ -#define TARGET_CPU_ultrasparc 11 -#define TARGET_CPU_ultrasparc3 12 -#define TARGET_CPU_niagara 13 -#define TARGET_CPU_niagara2 14 -#define TARGET_CPU_niagara3 15 -#define TARGET_CPU_niagara4 16 +#define TARGET_CPU_leon5 7 +#define TARGET_CPU_sparclite 8 +#define TARGET_CPU_f930 8 /* alias */ +#define TARGET_CPU_f934 8 /* alias */ +#define TARGET_CPU_sparclite86x 9 +#define TARGET_CPU_sparclet 10 +#define TARGET_CPU_tsc701 10 /* alias */ +#define TARGET_CPU_v9 11 /* generic v9 implementation */ +#define TARGET_CPU_sparcv9 11 /* alias */ +#define TARGET_CPU_sparc64 11 /* alias */ +#define TARGET_CPU_ultrasparc 12 +#define TARGET_CPU_ultrasparc3 13 +#define TARGET_CPU_niagara 14 +#define TARGET_CPU_niagara2 15 +#define TARGET_CPU_niagara3 16 +#define TARGET_CPU_niagara4 17 #define TARGET_CPU_niagara7 19 #define TARGET_CPU_m8 20 @@ -229,7 +230,8 @@ along with GCC; see the file COPYING3. If not see #endif #if TARGET_CPU_DEFAULT == TARGET_CPU_leon \ - || TARGET_CPU_DEFAULT == TARGET_CPU_leon3 + || TARGET_CPU_DEFAULT == TARGET_CPU_leon3 \ + || TARGET_CPU_DEFAULT == TARGET_CPU_leon5 #define CPP_CPU32_DEFAULT_SPEC "-D__leon__ -D__sparc_v8__" #define ASM_CPU32_DEFAULT_SPEC AS_LEON_FLAG #endif @@ -285,6 +287,7 @@ along with GCC; see the file COPYING3. If not see %{mcpu=hypersparc:-D__hypersparc__ -D__sparc_v8__} \ %{mcpu=leon:-D__leon__ -D__sparc_v8__} \ %{mcpu=leon3:-D__leon__ -D__sparc_v8__} \ +%{mcpu=leon5:-D__leon__ -D__sparc_v8__} \ %{mcpu=leon3v7:-D__leon__} \ %{mcpu=v9:-D__sparc_v9__} \ %{mcpu=ultrasparc:-D__sparc_v9__} \ @@ -337,6 +340,7 @@ along with GCC; see the file COPYING3. 
If not see %{mcpu=hypersparc:-Av8} \ %{mcpu=leon:" AS_LEON_FLAG "} \ %{mcpu=leon3:" AS_LEON_FLAG "} \ +%{mcpu=leon5:" AS_LEON_FLAG "} \ %{mcpu=leon3v7:" AS_LEONV7_FLAG "} \ %{mv8plus:-Av8plus} \ %{mcpu=v9:-Av9} \ diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index 24b76e0..294c918 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -233,6 +233,7 @@ hypersparc, leon, leon3, + leon5, leon3v7, sparclite, f930, @@ -638,6 +639,7 @@ (include "supersparc.md") (include "hypersparc.md") (include "leon.md") +(include "leon5.md") (include "sparclet.md") (include "ultra1_2.md") (include "ultra3.md") @@ -8353,9 +8355,15 @@ visl") (unspec:SI [(match_operand:SI 1 "memory_operand" "m")] UNSPEC_SP_SET)) (set (match_scratch:SI 2 "=&r") (const_int 0))] "TARGET_ARCH32" - "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2" +{ + if (sparc_fix_b2bst) + return "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2\;nop"; + else + return "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2"; +} [(set_attr "type" "multi") - (set_attr "length" "3")]) + (set (attr "length") (if_then_else (eq_attr "fix_b2bst" "true") + (const_int 4) (const_int 3)))]) (define_insn "stack_protect_set64" [(set (match_operand:DI 0 "memory_operand" "=m") diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt index fb79267..658a187 100644 --- a/gcc/config/sparc/sparc.opt +++ b/gcc/config/sparc/sparc.opt @@ -176,6 +176,9 @@ EnumValue Enum(sparc_processor) String(leon3v7) Value(PROCESSOR_LEON3V7) EnumValue +Enum(sparc_processor) String(leon5) Value(PROCESSOR_LEON5) + +EnumValue Enum(sparc_processor) String(sparclite) Value(PROCESSOR_SPARCLITE) EnumValue diff --git a/gcc/config/xtensa/t-xtensa b/gcc/config/xtensa/t-xtensa index 973815c..d06e492 100644 --- a/gcc/config/xtensa/t-xtensa +++ b/gcc/config/xtensa/t-xtensa @@ -16,4 +16,5 @@ # along with GCC; see the file COPYING3. If not see # <http://www.gnu.org/licenses/>. +TM_H += $(srcdir)/../include/xtensa-config.h $(out_object_file): gt-xtensa.h |
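A minimal sketch, not part of this patch, of the situation the new next_active_non_empty_insn helper addresses in sparc_do_work_around_errata: empty inline asm statements are now skipped rather than counted as the neighbouring instruction when the errata workarounds scan around loads and stores. The function name below is hypothetical, and the option spellings (-mfix-ut700, -mfix-gr712rc) are assumed from the sparc_fix_ut700 and sparc_fix_gr712rc flags used above:

/* Illustration only.  With the UT700/GR712RC errata workarounds enabled,
   the workaround pass scans the instructions around each load/store; the
   new next_active_non_empty_insn helper skips empty asm statements such
   as the one below instead of treating them as a real instruction.  */
void
copy_pair (int *dst, const int *src)
{
  dst[0] = src[0];
  __asm__ ("");   /* empty asm: skipped by the errata scan */
  dst[1] = src[1];
}

In the same vein, the stack_protect_set32 pattern above now appends a trailing nop (and reports length 4 instead of 3) when sparc_fix_b2bst is set. Separately, the sparc.opt, sparc.h and sparc.md hunks make -mcpu=leon5 selectable, wiring it to the new leon5.md scheduling description, leon5_costs and leon5_adjust_cost.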