about | summary | refs | log | tree | commit | diff
path: root/gcc/config
diff options
context:
space:
mode:
author Ian Lance Taylor <iant@golang.org> 2021-09-17 08:46:39 -0700
committer Ian Lance Taylor <iant@golang.org> 2021-09-17 08:46:39 -0700
commit a0791d0ed4f147ef347e83f4aedc7ad03f1a2008 (patch)
tree 7b3526910798e4cff7a7200d684383046bac6225 /gcc/config
parent e252b51ccde010cbd2a146485d8045103cd99533 (diff)
parent 89be17a1b231ade643f28fbe616d53377e069da8 (diff)
download gcc-a0791d0ed4f147ef347e83f4aedc7ad03f1a2008.zip
gcc-a0791d0ed4f147ef347e83f4aedc7ad03f1a2008.tar.gz
gcc-a0791d0ed4f147ef347e83f4aedc7ad03f1a2008.tar.bz2
Merge from trunk revision 89be17a1b231ade643f28fbe616d53377e069da8.
Diffstat (limited to 'gcc/config')
-rw-r--r-- gcc/config/arc/arc.md | 8
-rw-r--r-- gcc/config/i386/avx512fp16intrin.h | 4434
-rw-r--r-- gcc/config/i386/avx512fp16vlintrin.h | 2037
-rw-r--r-- gcc/config/i386/i386-builtin-types.def | 56
-rw-r--r-- gcc/config/i386/i386-builtin.def | 205
-rw-r--r-- gcc/config/i386/i386-expand.c | 163
-rw-r--r-- gcc/config/i386/i386-features.c | 25
-rw-r--r-- gcc/config/i386/i386-modes.def | 2
-rw-r--r-- gcc/config/i386/i386-options.c | 2
-rw-r--r-- gcc/config/i386/i386-protos.h | 2
-rw-r--r-- gcc/config/i386/i386.c | 56
-rw-r--r-- gcc/config/i386/i386.h | 4
-rw-r--r-- gcc/config/i386/i386.md | 9
-rw-r--r-- gcc/config/i386/sse.md | 989
-rw-r--r-- gcc/config/i386/subst.md | 1
-rw-r--r-- gcc/config/i386/vxworks.h | 24
-rw-r--r-- gcc/config/i386/x86-tune-costs.h | 124
-rw-r--r-- gcc/config/i386/x86-tune-sched.c | 2
-rw-r--r-- gcc/config/i386/x86-tune.def | 52
-rw-r--r-- gcc/config/mips/netbsd.h | 2
-rw-r--r-- gcc/config/rs6000/lynx.h | 1
-rw-r--r-- gcc/config/rs6000/mma.md | 31
-rw-r--r-- gcc/config/rs6000/rs6000-builtin-new.def | 9
-rw-r--r-- gcc/config/rs6000/rs6000-c.c | 1078
-rw-r--r-- gcc/config/rs6000/rs6000-call.c | 53
-rw-r--r-- gcc/config/rs6000/rs6000-gen-builtins.c | 2
-rw-r--r-- gcc/config/rs6000/rs6000.c | 5
-rw-r--r-- gcc/config/rs6000/rs6000.opt | 4
-rw-r--r-- gcc/config/rs6000/t-rs6000 | 17
-rw-r--r-- gcc/config/sparc/leon5.md | 103
-rw-r--r-- gcc/config/sparc/sparc-opts.h | 1
-rw-r--r-- gcc/config/sparc/sparc.c | 183
-rw-r--r-- gcc/config/sparc/sparc.h | 36
-rw-r--r-- gcc/config/sparc/sparc.md | 12
-rw-r--r-- gcc/config/sparc/sparc.opt | 3
-rw-r--r-- gcc/config/xtensa/t-xtensa | 1
36 files changed, 8988 insertions, 748 deletions
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index 90ba85e..4919d27 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -4966,8 +4966,8 @@ core_3, archs4x, archs4xd, archs4xd_slow"
(const_int 1))
(label_ref (match_operand 1 "" ""))
(pc)))
- (set (match_dup 0) (plus (match_dup 0) (const_int -1)))
- (unspec [(const_int 0)] UNSPEC_ARC_LP)
+ (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))
+ (unspec:SI [(const_int 0)] UNSPEC_ARC_LP)
(clobber (match_dup 2))])]
""
{
@@ -4996,8 +4996,8 @@ core_3, archs4x, archs4xd, archs4xd_slow"
(const_int 1))
(label_ref (match_operand 1 "" ""))
(pc)))
- (set (match_dup 0) (plus (match_dup 0) (const_int -1)))
- (unspec [(const_int 0)] UNSPEC_ARC_LP)
+ (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))
+ (unspec:SI [(const_int 0)] UNSPEC_ARC_LP)
(clobber (match_scratch:SI 2 "=X,&r"))]
""
"@
diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h
index ed8ad84..a5041ed 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -192,6 +192,159 @@ _mm512_setzero_ph (void)
return _mm512_set1_ph (0.0f);
}
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ph (void)
+{
+ __m128h __Y = __Y;
+ return __Y;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_ph (void)
+{
+ __m256h __Y = __Y;
+ return __Y;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_ph (void)
+{
+ __m512h __Y = __Y;
+ return __Y;
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_h (__m128h __A)
+{
+ return __A[0];
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsh_h (__m256h __A)
+{
+ return __A[0];
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsh_h (__m512h __A)
+{
+ return __A[0];
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_ps (__m512h __a)
+{
+ return (__m512) __a;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_pd (__m512h __a)
+{
+ return (__m512d) __a;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_si512 (__m512h __a)
+{
+ return (__m512i) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph512_ph128 (__m512h __A)
+{
+ union
+ {
+ __m128h a[4];
+ __m512h v;
+ } u = { .v = __A };
+ return u.a[0];
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph512_ph256 (__m512h __A)
+{
+ union
+ {
+ __m256h a[2];
+ __m512h v;
+ } u = { .v = __A };
+ return u.a[0];
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph128_ph512 (__m128h __A)
+{
+ union
+ {
+ __m128h a[4];
+ __m512h v;
+ } u;
+ u.a[0] = __A;
+ return u.v;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph256_ph512 (__m256h __A)
+{
+ union
+ {
+ __m256h a[2];
+ __m512h v;
+ } u;
+ u.a[0] = __A;
+ return u.v;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextph128_ph512 (__m128h __A)
+{
+ return (__m512h) _mm512_insertf32x4 (_mm512_setzero_ps (),
+ (__m128) __A, 0);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextph256_ph512 (__m256h __A)
+{
+ return (__m512h) _mm512_insertf64x4 (_mm512_setzero_pd (),
+ (__m256d) __A, 0);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_ph (__m512 __a)
+{
+ return (__m512h) __a;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_ph (__m512d __a)
+{
+ return (__m512h) __a;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_ph (__m512i __a)
+{
+ return (__m512h) __a;
+}
+
/* Create a vector with element 0 as F and the rest zero. */
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -229,15 +382,15 @@ extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
{
- return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_addph512_mask (__C, __D, __A, __B);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C)
{
- return __builtin_ia32_vaddph_v32hf_mask (__B, __C,
- _mm512_setzero_ph (), __A);
+ return __builtin_ia32_addph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
}
extern __inline __m512h
@@ -251,15 +404,15 @@ extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
{
- return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_subph512_mask (__C, __D, __A, __B);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C)
{
- return __builtin_ia32_vsubph_v32hf_mask (__B, __C,
- _mm512_setzero_ph (), __A);
+ return __builtin_ia32_subph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
}
extern __inline __m512h
@@ -273,15 +426,15 @@ extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
{
- return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_mulph512_mask (__C, __D, __A, __B);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C)
{
- return __builtin_ia32_vmulph_v32hf_mask (__B, __C,
- _mm512_setzero_ph (), __A);
+ return __builtin_ia32_mulph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
}
extern __inline __m512h
@@ -295,15 +448,15 @@ extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
{
- return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_divph512_mask (__C, __D, __A, __B);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C)
{
- return __builtin_ia32_vdivph_v32hf_mask (__B, __C,
- _mm512_setzero_ph (), __A);
+ return __builtin_ia32_divph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
}
#ifdef __OPTIMIZE__
@@ -311,9 +464,9 @@ extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C)
{
- return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B,
- _mm512_setzero_ph (),
- (__mmask32) -1, __C);
+ return __builtin_ia32_addph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
}
extern __inline __m512h
@@ -321,7 +474,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
__m512h __D, const int __E)
{
- return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_addph512_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m512h
@@ -329,18 +482,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
const int __D)
{
- return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C,
- _mm512_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_addph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C)
{
- return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B,
- _mm512_setzero_ph (),
- (__mmask32) -1, __C);
+ return __builtin_ia32_subph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
}
extern __inline __m512h
@@ -348,7 +501,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
__m512h __D, const int __E)
{
- return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_subph512_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m512h
@@ -356,18 +509,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
const int __D)
{
- return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C,
- _mm512_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_subph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C)
{
- return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B,
- _mm512_setzero_ph (),
- (__mmask32) -1, __C);
+ return __builtin_ia32_mulph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
}
extern __inline __m512h
@@ -375,7 +528,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
__m512h __D, const int __E)
{
- return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_mulph512_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m512h
@@ -383,18 +536,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
const int __D)
{
- return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C,
- _mm512_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_mulph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C)
{
- return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B,
- _mm512_setzero_ph (),
- (__mmask32) -1, __C);
+ return __builtin_ia32_divph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
}
extern __inline __m512h
@@ -402,7 +555,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
__m512h __D, const int __E)
{
- return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_divph512_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m512h
@@ -410,67 +563,67 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
const int __D)
{
- return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C,
- _mm512_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_divph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
}
#else
#define _mm512_add_round_ph(A, B, C) \
- ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B), \
- _mm512_setzero_ph (),\
- (__mmask32)-1, (C)))
+ ((__m512h)__builtin_ia32_addph512_mask_round((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
-#define _mm512_mask_add_round_ph(A, B, C, D, E) \
- ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E)))
+#define _mm512_mask_add_round_ph(A, B, C, D, E) \
+ ((__m512h)__builtin_ia32_addph512_mask_round((C), (D), (A), (B), (E)))
#define _mm512_maskz_add_round_ph(A, B, C, D) \
- ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C), \
- _mm512_setzero_ph (),\
- (A), (D)))
+ ((__m512h)__builtin_ia32_addph512_mask_round((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
#define _mm512_sub_round_ph(A, B, C) \
- ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B), \
- _mm512_setzero_ph (),\
- (__mmask32)-1, (C)))
+ ((__m512h)__builtin_ia32_subph512_mask_round((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
-#define _mm512_mask_sub_round_ph(A, B, C, D, E) \
- ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E)))
+#define _mm512_mask_sub_round_ph(A, B, C, D, E) \
+ ((__m512h)__builtin_ia32_subph512_mask_round((C), (D), (A), (B), (E)))
#define _mm512_maskz_sub_round_ph(A, B, C, D) \
- ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C), \
- _mm512_setzero_ph (),\
- (A), (D)))
+ ((__m512h)__builtin_ia32_subph512_mask_round((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
#define _mm512_mul_round_ph(A, B, C) \
- ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B), \
- _mm512_setzero_ph (),\
- (__mmask32)-1, (C)))
+ ((__m512h)__builtin_ia32_mulph512_mask_round((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
-#define _mm512_mask_mul_round_ph(A, B, C, D, E) \
- ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E)))
+#define _mm512_mask_mul_round_ph(A, B, C, D, E) \
+ ((__m512h)__builtin_ia32_mulph512_mask_round((C), (D), (A), (B), (E)))
#define _mm512_maskz_mul_round_ph(A, B, C, D) \
- ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C), \
- _mm512_setzero_ph (),\
- (A), (D)))
+ ((__m512h)__builtin_ia32_mulph512_mask_round((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
#define _mm512_div_round_ph(A, B, C) \
- ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B), \
- _mm512_setzero_ph (),\
- (__mmask32)-1, (C)))
+ ((__m512h)__builtin_ia32_divph512_mask_round((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
-#define _mm512_mask_div_round_ph(A, B, C, D, E) \
- ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E)))
+#define _mm512_mask_div_round_ph(A, B, C, D, E) \
+ ((__m512h)__builtin_ia32_divph512_mask_round((C), (D), (A), (B), (E)))
#define _mm512_maskz_div_round_ph(A, B, C, D) \
- ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C), \
- _mm512_setzero_ph (),\
- (A), (D)))
+ ((__m512h)__builtin_ia32_divph512_mask_round((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
#endif /* __OPTIMIZE__ */
/* Intrinsics of v[add,sub,mul,div]sh. */
extern __inline __m128h
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sh (__m128h __A, __m128h __B)
{
__A[0] += __B[0];
@@ -481,15 +634,15 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vaddsh_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_addsh_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_sh (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vaddsh_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_addsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m128h
@@ -504,15 +657,15 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vsubsh_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_subsh_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_sh (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vsubsh_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_subsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m128h
@@ -527,14 +680,14 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mul_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vmulsh_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_mulsh_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mul_sh (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vmulsh_v8hf_mask (__B, __C, _mm_setzero_ph (), __A);
+ return __builtin_ia32_mulsh_mask (__B, __C, _mm_setzero_ph (), __A);
}
extern __inline __m128h
@@ -549,15 +702,15 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_div_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vdivsh_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_divsh_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_div_sh (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vdivsh_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_divsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
#ifdef __OPTIMIZE__
@@ -565,9 +718,9 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_round_sh (__m128h __A, __m128h __B, const int __C)
{
- return __builtin_ia32_vaddsh_v8hf_mask_round (__A, __B,
- _mm_setzero_ph (),
- (__mmask8) -1, __C);
+ return __builtin_ia32_addsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
}
extern __inline __m128h
@@ -575,7 +728,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
__m128h __D, const int __E)
{
- return __builtin_ia32_vaddsh_v8hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_addsh_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m128h
@@ -583,18 +736,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
const int __D)
{
- return __builtin_ia32_vaddsh_v8hf_mask_round (__B, __C,
- _mm_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_addsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_round_sh (__m128h __A, __m128h __B, const int __C)
{
- return __builtin_ia32_vsubsh_v8hf_mask_round (__A, __B,
- _mm_setzero_ph (),
- (__mmask8) -1, __C);
+ return __builtin_ia32_subsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
}
extern __inline __m128h
@@ -602,7 +755,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
__m128h __D, const int __E)
{
- return __builtin_ia32_vsubsh_v8hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_subsh_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m128h
@@ -610,18 +763,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
const int __D)
{
- return __builtin_ia32_vsubsh_v8hf_mask_round (__B, __C,
- _mm_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_subsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_round_sh (__m128h __A, __m128h __B, const int __C)
{
- return __builtin_ia32_vmulsh_v8hf_mask_round (__A, __B,
- _mm_setzero_ph (),
- (__mmask8) -1, __C);
+ return __builtin_ia32_mulsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
}
extern __inline __m128h
@@ -629,7 +782,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mul_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
__m128h __D, const int __E)
{
- return __builtin_ia32_vmulsh_v8hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_mulsh_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m128h
@@ -637,18 +790,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mul_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
const int __D)
{
- return __builtin_ia32_vmulsh_v8hf_mask_round (__B, __C,
- _mm_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_mulsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_round_sh (__m128h __A, __m128h __B, const int __C)
{
- return __builtin_ia32_vdivsh_v8hf_mask_round (__A, __B,
- _mm_setzero_ph (),
- (__mmask8) -1, __C);
+ return __builtin_ia32_divsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
}
extern __inline __m128h
@@ -656,7 +809,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_div_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
__m128h __D, const int __E)
{
- return __builtin_ia32_vdivsh_v8hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_divsh_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m128h
@@ -664,62 +817,62 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_div_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
const int __D)
{
- return __builtin_ia32_vdivsh_v8hf_mask_round (__B, __C,
- _mm_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_divsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
}
#else
#define _mm_add_round_sh(A, B, C) \
- ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((A), (B), \
- _mm_setzero_ph (), \
- (__mmask8)-1, (C)))
+ ((__m128h)__builtin_ia32_addsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
#define _mm_mask_add_round_sh(A, B, C, D, E) \
- ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((C), (D), (A), (B), (E)))
+ ((__m128h)__builtin_ia32_addsh_mask_round ((C), (D), (A), (B), (E)))
-#define _mm_maskz_add_round_sh(A, B, C, D) \
- ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((B), (C), \
- _mm_setzero_ph (), \
- (A), (D)))
+#define _mm_maskz_add_round_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_addsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
#define _mm_sub_round_sh(A, B, C) \
- ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((A), (B), \
- _mm_setzero_ph (), \
- (__mmask8)-1, (C)))
+ ((__m128h)__builtin_ia32_subsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
#define _mm_mask_sub_round_sh(A, B, C, D, E) \
- ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((C), (D), (A), (B), (E)))
+ ((__m128h)__builtin_ia32_subsh_mask_round ((C), (D), (A), (B), (E)))
-#define _mm_maskz_sub_round_sh(A, B, C, D) \
- ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((B), (C), \
- _mm_setzero_ph (), \
- (A), (D)))
+#define _mm_maskz_sub_round_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_subsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
#define _mm_mul_round_sh(A, B, C) \
- ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((A), (B), \
- _mm_setzero_ph (), \
- (__mmask8)-1, (C)))
+ ((__m128h)__builtin_ia32_mulsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
#define _mm_mask_mul_round_sh(A, B, C, D, E) \
- ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((C), (D), (A), (B), (E)))
+ ((__m128h)__builtin_ia32_mulsh_mask_round ((C), (D), (A), (B), (E)))
-#define _mm_maskz_mul_round_sh(A, B, C, D) \
- ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((B), (C), \
- _mm_setzero_ph (), \
- (A), (D)))
+#define _mm_maskz_mul_round_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_mulsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
#define _mm_div_round_sh(A, B, C) \
- ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((A), (B), \
- _mm_setzero_ph (), \
- (__mmask8)-1, (C)))
+ ((__m128h)__builtin_ia32_divsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
#define _mm_mask_div_round_sh(A, B, C, D, E) \
- ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((C), (D), (A), (B), (E)))
+ ((__m128h)__builtin_ia32_divsh_mask_round ((C), (D), (A), (B), (E)))
-#define _mm_maskz_div_round_sh(A, B, C, D) \
- ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((B), (C), \
- _mm_setzero_ph (), \
- (A), (D)))
+#define _mm_maskz_div_round_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_divsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
#endif /* __OPTIMIZE__ */
/* Intrinsic vmaxph vminph. */
@@ -727,48 +880,48 @@ extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_ph (__m512h __A, __m512h __B)
{
- return __builtin_ia32_vmaxph_v32hf_mask (__A, __B,
- _mm512_setzero_ph (),
- (__mmask32) -1);
+ return __builtin_ia32_maxph512_mask (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
{
- return __builtin_ia32_vmaxph_v32hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_maxph512_mask (__C, __D, __A, __B);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_ph (__mmask32 __A, __m512h __B, __m512h __C)
{
- return __builtin_ia32_vmaxph_v32hf_mask (__B, __C,
- _mm512_setzero_ph (), __A);
+ return __builtin_ia32_maxph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_ph (__m512h __A, __m512h __B)
{
- return __builtin_ia32_vminph_v32hf_mask (__A, __B,
- _mm512_setzero_ph (),
- (__mmask32) -1);
+ return __builtin_ia32_minph512_mask (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
{
- return __builtin_ia32_vminph_v32hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_minph512_mask (__C, __D, __A, __B);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_ph (__mmask32 __A, __m512h __B, __m512h __C)
{
- return __builtin_ia32_vminph_v32hf_mask (__B, __C,
- _mm512_setzero_ph (), __A);
+ return __builtin_ia32_minph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
}
#ifdef __OPTIMIZE__
@@ -776,9 +929,9 @@ extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_max_round_ph (__m512h __A, __m512h __B, const int __C)
{
- return __builtin_ia32_vmaxph_v32hf_mask_round (__A, __B,
- _mm512_setzero_ph (),
- (__mmask32) -1, __C);
+ return __builtin_ia32_maxph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
}
extern __inline __m512h
@@ -786,7 +939,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_max_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
__m512h __D, const int __E)
{
- return __builtin_ia32_vmaxph_v32hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_maxph512_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m512h
@@ -794,18 +947,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_max_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
const int __D)
{
- return __builtin_ia32_vmaxph_v32hf_mask_round (__B, __C,
- _mm512_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_maxph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
}
extern __inline __m512h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_min_round_ph (__m512h __A, __m512h __B, const int __C)
{
- return __builtin_ia32_vminph_v32hf_mask_round (__A, __B,
- _mm512_setzero_ph (),
- (__mmask32) -1, __C);
+ return __builtin_ia32_minph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
}
extern __inline __m512h
@@ -813,7 +966,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_min_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
__m512h __D, const int __E)
{
- return __builtin_ia32_vminph_v32hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_minph512_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m512h
@@ -821,37 +974,37 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_min_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
const int __D)
{
- return __builtin_ia32_vminph_v32hf_mask_round (__B, __C,
- _mm512_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_minph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
}
#else
-#define _mm512_max_round_ph(A, B, C) \
- (__builtin_ia32_vmaxph_v32hf_mask_round ((A), (B), \
- _mm512_setzero_ph (), \
- (__mmask32)-1, (C)))
+#define _mm512_max_round_ph(A, B, C) \
+ (__builtin_ia32_maxph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
#define _mm512_mask_max_round_ph(A, B, C, D, E) \
- (__builtin_ia32_vmaxph_v32hf_mask_round ((C), (D), (A), (B), (E)))
+ (__builtin_ia32_maxph512_mask_round ((C), (D), (A), (B), (E)))
-#define _mm512_maskz_max_round_ph(A, B, C, D) \
- (__builtin_ia32_vmaxph_v32hf_mask_round ((B), (C), \
- _mm512_setzero_ph (), \
- (A), (D)))
+#define _mm512_maskz_max_round_ph(A, B, C, D) \
+ (__builtin_ia32_maxph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
-#define _mm512_min_round_ph(A, B, C) \
- (__builtin_ia32_vminph_v32hf_mask_round ((A), (B), \
- _mm512_setzero_ph (), \
- (__mmask32)-1, (C)))
+#define _mm512_min_round_ph(A, B, C) \
+ (__builtin_ia32_minph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
#define _mm512_mask_min_round_ph(A, B, C, D, E) \
- (__builtin_ia32_vminph_v32hf_mask_round ((C), (D), (A), (B), (E)))
+ (__builtin_ia32_minph512_mask_round ((C), (D), (A), (B), (E)))
-#define _mm512_maskz_min_round_ph(A, B, C, D) \
- (__builtin_ia32_vminph_v32hf_mask_round ((B), (C), \
- _mm512_setzero_ph (), \
- (A), (D)))
+#define _mm512_maskz_min_round_ph(A, B, C, D) \
+ (__builtin_ia32_minph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
#endif /* __OPTIMIZE__ */
/* Intrinsic vmaxsh vminsh. */
@@ -867,15 +1020,15 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vmaxsh_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_maxsh_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_sh (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vmaxsh_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_maxsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m128h
@@ -890,15 +1043,15 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vminsh_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_minsh_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_sh (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vminsh_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_minsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
#ifdef __OPTIMIZE__
@@ -906,9 +1059,9 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_round_sh (__m128h __A, __m128h __B, const int __C)
{
- return __builtin_ia32_vmaxsh_v8hf_mask_round (__A, __B,
- _mm_setzero_ph (),
- (__mmask8) -1, __C);
+ return __builtin_ia32_maxsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
}
extern __inline __m128h
@@ -916,7 +1069,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
__m128h __D, const int __E)
{
- return __builtin_ia32_vmaxsh_v8hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_maxsh_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m128h
@@ -924,18 +1077,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
const int __D)
{
- return __builtin_ia32_vmaxsh_v8hf_mask_round (__B, __C,
- _mm_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_maxsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_round_sh (__m128h __A, __m128h __B, const int __C)
{
- return __builtin_ia32_vminsh_v8hf_mask_round (__A, __B,
- _mm_setzero_ph (),
- (__mmask8) -1, __C);
+ return __builtin_ia32_minsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
}
extern __inline __m128h
@@ -943,7 +1096,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
__m128h __D, const int __E)
{
- return __builtin_ia32_vminsh_v8hf_mask_round (__C, __D, __A, __B, __E);
+ return __builtin_ia32_minsh_mask_round (__C, __D, __A, __B, __E);
}
extern __inline __m128h
@@ -951,37 +1104,37 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
const int __D)
{
- return __builtin_ia32_vminsh_v8hf_mask_round (__B, __C,
- _mm_setzero_ph (),
- __A, __D);
+ return __builtin_ia32_minsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
}
#else
-#define _mm_max_round_sh(A, B, C) \
- (__builtin_ia32_vmaxsh_v8hf_mask_round ((A), (B), \
- _mm_setzero_ph (), \
- (__mmask8)-1, (C)))
+#define _mm_max_round_sh(A, B, C) \
+ (__builtin_ia32_maxsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
-#define _mm_mask_max_round_sh(A, B, C, D, E) \
- (__builtin_ia32_vmaxsh_v8hf_mask_round ((C), (D), (A), (B), (E)))
+#define _mm_mask_max_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_maxsh_mask_round ((C), (D), (A), (B), (E)))
-#define _mm_maskz_max_round_sh(A, B, C, D) \
- (__builtin_ia32_vmaxsh_v8hf_mask_round ((B), (C), \
- _mm_setzero_ph (), \
- (A), (D)))
+#define _mm_maskz_max_round_sh(A, B, C, D) \
+ (__builtin_ia32_maxsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
-#define _mm_min_round_sh(A, B, C) \
- (__builtin_ia32_vminsh_v8hf_mask_round ((A), (B), \
- _mm_setzero_ph (), \
- (__mmask8)-1, (C)))
+#define _mm_min_round_sh(A, B, C) \
+ (__builtin_ia32_minsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
-#define _mm_mask_min_round_sh(A, B, C, D, E) \
- (__builtin_ia32_vminsh_v8hf_mask_round ((C), (D), (A), (B), (E)))
+#define _mm_mask_min_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_minsh_mask_round ((C), (D), (A), (B), (E)))
-#define _mm_maskz_min_round_sh(A, B, C, D) \
- (__builtin_ia32_vminsh_v8hf_mask_round ((B), (C), \
- _mm_setzero_ph (), \
- (A), (D)))
+#define _mm_maskz_min_round_sh(A, B, C, D) \
+ (__builtin_ia32_minsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
#endif /* __OPTIMIZE__ */
@@ -991,8 +1144,8 @@ extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_ph_mask (__m512h __A, __m512h __B, const int __C)
{
- return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask (__A, __B, __C,
- (__mmask32) -1);
+ return (__mmask32) __builtin_ia32_cmpph512_mask (__A, __B, __C,
+ (__mmask32) -1);
}
extern __inline __mmask32
@@ -1000,8 +1153,8 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_ph_mask (__mmask32 __A, __m512h __B, __m512h __C,
const int __D)
{
- return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask (__B, __C, __D,
- __A);
+ return (__mmask32) __builtin_ia32_cmpph512_mask (__B, __C, __D,
+ __A);
}
extern __inline __mmask32
@@ -1009,9 +1162,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cmp_round_ph_mask (__m512h __A, __m512h __B, const int __C,
const int __D)
{
- return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask_round (__A, __B,
- __C, (__mmask32) -1,
- __D);
+ return (__mmask32) __builtin_ia32_cmpph512_mask_round (__A, __B,
+ __C, (__mmask32) -1,
+ __D);
}
extern __inline __mmask32
@@ -1019,23 +1172,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cmp_round_ph_mask (__mmask32 __A, __m512h __B, __m512h __C,
const int __D, const int __E)
{
- return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask_round (__B, __C,
- __D, __A,
- __E);
+ return (__mmask32) __builtin_ia32_cmpph512_mask_round (__B, __C,
+ __D, __A,
+ __E);
}
#else
#define _mm512_cmp_ph_mask(A, B, C) \
- (__builtin_ia32_vcmpph_v32hf_mask ((A), (B), (C), (-1)))
+ (__builtin_ia32_cmpph512_mask ((A), (B), (C), (-1)))
#define _mm512_mask_cmp_ph_mask(A, B, C, D) \
- (__builtin_ia32_vcmpph_v32hf_mask ((B), (C), (D), (A)))
+ (__builtin_ia32_cmpph512_mask ((B), (C), (D), (A)))
-#define _mm512_cmp_round_ph_mask(A, B, C, D) \
- (__builtin_ia32_vcmpph_v32hf_mask_round ((A), (B), (C), (-1), (D)))
+#define _mm512_cmp_round_ph_mask(A, B, C, D) \
+ (__builtin_ia32_cmpph512_mask_round ((A), (B), (C), (-1), (D)))
-#define _mm512_mask_cmp_round_ph_mask(A, B, C, D, E) \
- (__builtin_ia32_vcmpph_v32hf_mask_round ((B), (C), (D), (A), (E)))
+#define _mm512_mask_cmp_round_ph_mask(A, B, C, D, E) \
+ (__builtin_ia32_cmpph512_mask_round ((B), (C), (D), (A), (E)))
#endif /* __OPTIMIZE__ */
@@ -1046,9 +1199,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sh_mask (__m128h __A, __m128h __B, const int __C)
{
return (__mmask8)
- __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B,
- __C, (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ __builtin_ia32_cmpsh_mask_round (__A, __B,
+ __C, (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline __mmask8
@@ -1057,9 +1210,9 @@ _mm_mask_cmp_sh_mask (__mmask8 __A, __m128h __B, __m128h __C,
const int __D)
{
return (__mmask8)
- __builtin_ia32_vcmpsh_v8hf_mask_round (__B, __C,
- __D, __A,
- _MM_FROUND_CUR_DIRECTION);
+ __builtin_ia32_cmpsh_mask_round (__B, __C,
+ __D, __A,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline __mmask8
@@ -1067,9 +1220,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_round_sh_mask (__m128h __A, __m128h __B, const int __C,
const int __D)
{
- return (__mmask8) __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B,
- __C, (__mmask8) -1,
- __D);
+ return (__mmask8) __builtin_ia32_cmpsh_mask_round (__A, __B,
+ __C, (__mmask8) -1,
+ __D);
}
extern __inline __mmask8
@@ -1077,25 +1230,25 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_round_sh_mask (__mmask8 __A, __m128h __B, __m128h __C,
const int __D, const int __E)
{
- return (__mmask8) __builtin_ia32_vcmpsh_v8hf_mask_round (__B, __C,
- __D, __A,
- __E);
+ return (__mmask8) __builtin_ia32_cmpsh_mask_round (__B, __C,
+ __D, __A,
+ __E);
}
#else
-#define _mm_cmp_sh_mask(A, B, C) \
- (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (C), (-1), \
- (_MM_FROUND_CUR_DIRECTION)))
+#define _mm_cmp_sh_mask(A, B, C) \
+ (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), \
+ (_MM_FROUND_CUR_DIRECTION)))
-#define _mm_mask_cmp_sh_mask(A, B, C, D) \
- (__builtin_ia32_vcmpsh_v8hf_mask_round ((B), (C), (D), (A), \
- (_MM_FROUND_CUR_DIRECTION)))
+#define _mm_mask_cmp_sh_mask(A, B, C, D) \
+ (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), \
+ (_MM_FROUND_CUR_DIRECTION)))
-#define _mm_cmp_round_sh_mask(A, B, C, D) \
- (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (C), (-1), (D)))
+#define _mm_cmp_round_sh_mask(A, B, C, D) \
+ (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), (D)))
-#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \
- (__builtin_ia32_vcmpsh_v8hf_mask_round ((B), (C), (D), (A), (E)))
+#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \
+ (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), (E)))
#endif /* __OPTIMIZE__ */
@@ -1104,137 +1257,3792 @@ extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_EQ_OS,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LT_OS,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LE_OS,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GT_OS,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GE_OS,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_NEQ_US,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_US,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_EQ_OQ,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LT_OQ,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LE_OQ,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GT_OQ,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GE_OQ,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sh (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_NEQ_UQ,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_UQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
#ifdef __OPTIMIZE__
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
- _mm_comi_sh (__m128h __A, __m128h __B, const int __P)
+_mm_comi_sh (__m128h __A, __m128h __B, const int __P)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, __P,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, __P,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
}
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_comi_round_sh (__m128h __A, __m128h __B, const int __P, const int __R)
{
- return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, __P,
- (__mmask8) -1,__R);
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, __P,
+ (__mmask8) -1,__R);
}
#else
-#define _mm_comi_round_sh(A, B, P, R) \
- (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (P), (__mmask8) (-1), (R)))
-#define _mm_comi_sh(A, B, P) \
- (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (P), (__mmask8) (-1), \
- _MM_FROUND_CUR_DIRECTION))
+#define _mm_comi_round_sh(A, B, P, R) \
+ (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), (R)))
+#define _mm_comi_sh(A, B, P) \
+ (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), \
+ _MM_FROUND_CUR_DIRECTION))
#endif /* __OPTIMIZE__ */
+/* Intrinsics vsqrtph. */
+/* Unmasked vsqrtph wrapper: forwards __A together with a zero vector, an
+   all-ones __mmask32 and _MM_FROUND_CUR_DIRECTION to the builtin. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_ph (__m512h __A)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__A,
+ _mm512_setzero_ph(),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: the builtin receives __C first, then __A (presumably the
+   value kept for lanes whose mask bit is clear -- confirm), the mask __B
+   and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* maskz form: forwards __B with a zero vector as the second operand, the
+   caller's mask __A and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_ph (__mmask32 __A, __m512h __B)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__B,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Inline form, compiled only when __OPTIMIZE__ is defined.  Same operands
+   as _mm512_sqrt_ph except that the caller's __B is passed as the last
+   builtin argument instead of _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_ph (__m512h __A, const int __B)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__A,
+ _mm512_setzero_ph(),
+ (__mmask32) -1, __B);
+}
+
+/* Inline form (only under __OPTIMIZE__): forwards __C, then __A, the mask
+   __B and the caller's __D as the last builtin argument. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, __D);
+}
+
+/* Inline form (only under __OPTIMIZE__): forwards __B, a zero vector, the
+   mask __A and the caller's __C as the last builtin argument. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_ph (__mmask32 __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__B,
+ _mm512_setzero_ph (),
+ __A, __C);
+}
+
+#else
+/* Macro forms of the three *_sqrt_round_ph intrinsics, used when
+   __OPTIMIZE__ is not defined.  Each expands to the same builtin call, with
+   the same operand order, as the inline function of the same name. */
+#define _mm512_sqrt_round_ph(A, B) \
+ (__builtin_ia32_sqrtph512_mask_round ((A), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (B)))
+
+#define _mm512_mask_sqrt_round_ph(A, B, C, D) \
+ (__builtin_ia32_sqrtph512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_sqrt_round_ph(A, B, C) \
+ (__builtin_ia32_sqrtph512_mask_round ((B), \
+ _mm512_setzero_ph (), \
+ (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vrsqrtph. */
+/* Unmasked vrsqrtph wrapper: forwards __A with a zero vector and an
+   all-ones __mmask32.  The builtin takes no rounding argument. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt_ph (__m512h __A)
+{
+ return __builtin_ia32_rsqrtph512_mask (__A, _mm512_setzero_ph (),
+ (__mmask32) -1);
+}
+
+/* Masked form: forwards __C first, then __A (presumably the pass-through
+   value for unselected lanes -- confirm) and the mask __B. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+ return __builtin_ia32_rsqrtph512_mask (__C, __A, __B);
+}
+
+/* maskz form: forwards __B with a zero vector as the second operand and
+   the caller's mask __A. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt_ph (__mmask32 __A, __m512h __B)
+{
+ return __builtin_ia32_rsqrtph512_mask (__B, _mm512_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vrsqrtsh. */
+/* Unmasked vrsqrtsh wrapper.  Note the swap: the builtin receives __B
+   first and __A second, followed by a zero vector and an all-ones
+   __mmask8. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_rsqrtsh_mask (__B, __A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+/* Masked form: the builtin receives __D, then __C, then __A (presumably
+   the pass-through value -- confirm) and the 8-bit mask __B. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_rsqrtsh_mask (__D, __C, __A, __B);
+}
+
+/* maskz form: the builtin receives __C, then __B, a zero vector and the
+   caller's mask __A. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_rsqrtsh_mask (__C, __B, _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vsqrtsh. */
+/* Unmasked vsqrtsh wrapper.  Note the swap: the builtin receives __B first
+   and __A second, followed by a zero vector, an all-ones __mmask8 and
+   _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__B, __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: the builtin receives __D, then __C, then __A (presumably
+   the pass-through value -- confirm), the mask __B and
+   _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* maskz form: the builtin receives __C, then __B, a zero vector, the
+   caller's mask __A and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__C, __B,
+ _mm_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Inline form (only under __OPTIMIZE__).  Same operands as _mm_sqrt_sh
+   except that the caller's __C is the last builtin argument. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__B, __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+/* Inline form (only under __OPTIMIZE__): the builtin receives __D, __C,
+   __A, the mask __B and the caller's __E as the last argument. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B,
+ __E);
+}
+
+/* Inline form (only under __OPTIMIZE__): the builtin receives __C, __B, a
+   zero vector, the mask __A and the caller's __D as the last argument. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__C, __B,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+#else
+/* Macro forms of the three *_sqrt_round_sh intrinsics, used when
+   __OPTIMIZE__ is not defined.  As in the inline functions of the same
+   names, the two vector operands reach the builtin in swapped order. */
+#define _mm_sqrt_round_sh(A, B, C) \
+ (__builtin_ia32_sqrtsh_mask_round ((B), (A), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_sqrt_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_sqrtsh_mask_round ((D), (C), (A), (B), (E)))
+
+#define _mm_maskz_sqrt_round_sh(A, B, C, D) \
+ (__builtin_ia32_sqrtsh_mask_round ((C), (B), \
+ _mm_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vrcpph. */
+/* Unmasked vrcpph wrapper: forwards __A with a zero vector and an all-ones
+   __mmask32.  The builtin takes no rounding argument. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp_ph (__m512h __A)
+{
+ return __builtin_ia32_rcpph512_mask (__A, _mm512_setzero_ph (),
+ (__mmask32) -1);
+}
+
+/* Masked form: forwards __C first, then __A (presumably the pass-through
+   value for unselected lanes -- confirm) and the mask __B. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+ return __builtin_ia32_rcpph512_mask (__C, __A, __B);
+}
+
+/* maskz form: forwards __B with a zero vector as the second operand and
+   the caller's mask __A. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp_ph (__mmask32 __A, __m512h __B)
+{
+ return __builtin_ia32_rcpph512_mask (__B, _mm512_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vrcpsh. */
+/* Unmasked vrcpsh wrapper.  Note the swap: the builtin receives __B first
+   and __A second, followed by a zero vector and an all-ones __mmask8. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_rcpsh_mask (__B, __A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+/* Masked vrcpsh wrapper: the builtin receives __D, then __C, then __A
+   (presumably the pass-through value -- confirm) and the mask __B.
+   The mask is an __mmask8, matching the other scalar (_sh) intrinsics in
+   this header such as _mm_mask_rsqrt_sh and the (__mmask8) -1 used by
+   _mm_rcp_sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_rcpsh_mask (__D, __C, __A, __B);
+}
+
+/* maskz vrcpsh wrapper: the builtin receives __C, then __B, a zero vector
+   and the caller's mask __A.  The mask is an __mmask8, matching the other
+   scalar (_sh) intrinsics in this header such as _mm_maskz_rsqrt_sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_rcpsh_mask (__C, __B, _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vscalefph. */
+/* Unmasked vscalefph wrapper: forwards __A and __B in order, with a zero
+   vector, an all-ones __mmask32 and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_ph (__m512h __A, __m512h __B)
+{
+ return __builtin_ia32_scalefph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: forwards __C and __D as the two operands, then __A
+   (presumably the pass-through value -- confirm), the mask __B and
+   _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+ return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* maskz form: forwards __B and __C as the two operands, then a zero
+   vector, the caller's mask __A and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+ return __builtin_ia32_scalefph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Inline form (only under __OPTIMIZE__).  Same operands as
+   _mm512_scalef_ph except that the caller's __C is the last builtin
+   argument. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_scalefph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+/* Inline form (only under __OPTIMIZE__): forwards __C, __D, __A, the mask
+   __B and the caller's __E as the last builtin argument. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B,
+ __E);
+}
+
+/* Inline form (only under __OPTIMIZE__): forwards __B, __C, a zero vector,
+   the mask __A and the caller's __D as the last builtin argument. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_scalefph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+#else
+/* Macro forms of the three *_scalef_round_ph intrinsics, used when
+   __OPTIMIZE__ is not defined.  Each expands to the same builtin call, with
+   the same operand order, as the inline function of the same name. */
+#define _mm512_scalef_round_ph(A, B, C) \
+ (__builtin_ia32_scalefph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_scalef_round_ph(A, B, C, D, E) \
+ (__builtin_ia32_scalefph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_scalef_round_ph(A, B, C, D) \
+ (__builtin_ia32_scalefph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vscalefsh. */
+/* Unmasked vscalefsh wrapper: forwards __A and __B in order (no swap,
+   unlike the sqrt/rsqrt/rcp scalar wrappers), with a zero vector, an
+   all-ones __mmask8 and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_scalefsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: forwards __C and __D as the two operands, then __A
+   (presumably the pass-through value -- confirm), the mask __B and
+   _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* maskz form: forwards __B and __C as the two operands, then a zero
+   vector, the caller's mask __A and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_scalefsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Inline form (only under __OPTIMIZE__).  Same operands as _mm_scalef_sh
+   except that the caller's __C is the last builtin argument. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_scalefsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+/* Inline form (only under __OPTIMIZE__): forwards __C, __D, __A, the mask
+   __B and the caller's __E as the last builtin argument. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B,
+ __E);
+}
+
+/* Inline form (only under __OPTIMIZE__): forwards __B, __C, a zero vector,
+   the mask __A and the caller's __D as the last builtin argument. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_scalefsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+#else
+/* Macro forms of the three *_scalef_round_sh intrinsics, used when
+   __OPTIMIZE__ is not defined.  Each expands to the same builtin call, with
+   the same operand order, as the inline function of the same name. */
+#define _mm_scalef_round_sh(A, B, C) \
+ (__builtin_ia32_scalefsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_scalef_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_scalefsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_scalef_round_sh(A, B, C, D) \
+ (__builtin_ia32_scalefsh_mask_round ((B), (C), _mm_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vreduceph. */
+#ifdef __OPTIMIZE__
+/* Unmasked vreduceph wrapper (inline form, only under __OPTIMIZE__):
+   forwards __A and the integer __B (presumably the instruction's immediate
+   -- confirm), then a zero vector, an all-ones __mmask32 and
+   _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_ph (__m512h __A, int __B)
+{
+ return __builtin_ia32_reduceph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form (only under __OPTIMIZE__): forwards __C and the integer __D,
+   then __A (presumably the pass-through value -- confirm), the mask __B
+   and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_ph (__m512h __A, __mmask32 __B, __m512h __C, int __D)
+{
+ return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* maskz form (only under __OPTIMIZE__): forwards __B and the integer __C,
+   then a zero vector, the caller's mask __A and
+   _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_ph (__mmask32 __A, __m512h __B, int __C)
+{
+ return __builtin_ia32_reduceph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Same operands as _mm512_reduce_ph except that the caller's __C is the
+   last builtin argument (only under __OPTIMIZE__). */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_round_ph (__m512h __A, int __B, const int __C)
+{
+ return __builtin_ia32_reduceph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+/* Masked form with explicit last argument (only under __OPTIMIZE__):
+   forwards __C, the integer __D, __A, the mask __B and the caller's __E. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ int __D, const int __E)
+{
+ return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B,
+ __E);
+}
+
+/* maskz form with explicit last argument (only under __OPTIMIZE__):
+   forwards __B, the integer __C, a zero vector, the mask __A and the
+   caller's __D. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_round_ph (__mmask32 __A, __m512h __B, int __C,
+ const int __D)
+{
+ return __builtin_ia32_reduceph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+#else
+/* Macro forms of the six 512-bit reduce intrinsics, used when __OPTIMIZE__
+   is not defined.  Each expands to the same builtin call, with the same
+   operand order, as the inline function of the same name; the non-round
+   variants pass _MM_FROUND_CUR_DIRECTION as the last argument. */
+#define _mm512_reduce_ph(A, B) \
+ (__builtin_ia32_reduceph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_ph(A, B, C, D) \
+ (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_reduce_ph(A, B, C) \
+ (__builtin_ia32_reduceph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_reduce_round_ph(A, B, C) \
+ (__builtin_ia32_reduceph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_reduce_round_ph(A, B, C, D, E) \
+ (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_reduce_round_ph(A, B, C, D) \
+ (__builtin_ia32_reduceph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vreducesh. */
+#ifdef __OPTIMIZE__
+/* Unmasked vreducesh wrapper (inline form, only under __OPTIMIZE__):
+   forwards __A, __B and the integer __C (presumably the instruction's
+   immediate -- confirm), then a zero vector, an all-ones __mmask8 and
+   _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_sh (__m128h __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_reducesh_mask_round (__A, __B, __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form (only under __OPTIMIZE__): forwards __C, __D and the integer
+   __E, then __A (presumably the pass-through value -- confirm), the mask
+   __B and _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, int __E)
+{
+ return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* maskz form (only under __OPTIMIZE__): forwards __B, __C and the integer
+   __D, then a zero vector, the caller's mask __A and
+   _MM_FROUND_CUR_DIRECTION. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_reducesh_mask_round (__B, __C, __D,
+ _mm_setzero_ph (), __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Same operands as _mm_reduce_sh except that the caller's __D is the last
+   builtin argument (only under __OPTIMIZE__). */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_sh (__m128h __A, __m128h __B, int __C, const int __D)
+{
+ return __builtin_ia32_reducesh_mask_round (__A, __B, __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __D);
+}
+
+/* Masked form with explicit last argument (only under __OPTIMIZE__):
+   forwards __C, __D, the integer __E, __A, the mask __B and the caller's
+   __F. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, int __E, const int __F)
+{
+ return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A,
+ __B, __F);
+}
+
+/* maskz form with explicit last argument (only under __OPTIMIZE__):
+   forwards __B, __C, the integer __D, a zero vector, the mask __A and the
+   caller's __E. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ int __D, const int __E)
+{
+ return __builtin_ia32_reducesh_mask_round (__B, __C, __D,
+ _mm_setzero_ph (),
+ __A, __E);
+}
+
+#else
+/* Macro forms of the six scalar reduce intrinsics, used when __OPTIMIZE__
+   is not defined.  Each expands to the same builtin call, with the same
+   operand order, as the inline function of the same name; the non-round
+   variants pass _MM_FROUND_CUR_DIRECTION as the last argument. */
+#define _mm_reduce_sh(A, B, C) \
+ (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_reduce_sh(A, B, C, D, E) \
+ (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_reduce_sh(A, B, C, D) \
+ (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \
+ _mm_setzero_ph (), \
+ (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_reduce_round_sh(A, B, C, D) \
+ (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (D)))
+
+#define _mm_mask_reduce_round_sh(A, B, C, D, E, F) \
+ (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), (F)))
+
+#define _mm_maskz_reduce_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \
+ _mm_setzero_ph (), \
+ (A), (E)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vrndscaleph. */
+#ifdef __OPTIMIZE__
+/* Forwards __A and the immediate __B to
+   __builtin_ia32_rndscaleph512_mask_round together with a zero vector
+   from _mm512_setzero_ph (), the mask (__mmask32) -1 and
+   _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_ph (__m512h __A, int __B)
+{
+  return __builtin_ia32_rndscaleph512_mask_round (__A, __B,
+                                                  _mm512_setzero_ph (),
+                                                  (__mmask32) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_ph (__m512h __A, __mmask32 __B,
+ __m512h __C, int __D)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_ph (__mmask32 __A, __m512h __B, int __C)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_ph (__m512h __A, int __B, const int __C)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_ph (__m512h __A, __mmask32 __B,
+ __m512h __C, int __D, const int __E)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A,
+ __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_ph (__mmask32 __A, __m512h __B, int __C,
+ const int __D)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+#else
+#define _mm512_roundscale_ph(A, B) \
+ (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_ph(A, B, C, D) \
+ (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+/* Macro form used when __OPTIMIZE__ is not defined: forwards B and the
+   immediate C to __builtin_ia32_rndscaleph512_mask_round with a zero
+   vector from _mm512_setzero_ph (), the mask A and
+   _MM_FROUND_CUR_DIRECTION.  */
+#define _mm512_maskz_roundscale_ph(A, B, C) \
+  (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \
+                                            _mm512_setzero_ph (), \
+                                            (A), \
+                                            _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_roundscale_round_ph(A, B, C) \
+ (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_roundscale_round_ph(A, B, C, D, E) \
+ (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_roundscale_round_ph(A, B, C, D) \
+ (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vrndscalesh. */
+#ifdef __OPTIMIZE__
+/* Forwards __A, __B and the immediate __C to
+   __builtin_ia32_rndscalesh_mask_round together with a zero vector from
+   _mm_setzero_ph (), the mask (__mmask8) -1 and
+   _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_sh (__m128h __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C,
+                                               _mm_setzero_ph (),
+                                               (__mmask8) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, int __E)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D,
+ _mm_setzero_ph (), __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_sh (__m128h __A, __m128h __B, int __C, const int __D)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, int __E, const int __F)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E,
+ __A, __B, __F);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ int __D, const int __E)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D,
+ _mm_setzero_ph (),
+ __A, __E);
+}
+
+#else
+#define _mm_roundscale_sh(A, B, C) \
+ (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_sh(A, B, C, D, E) \
+ (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_roundscale_sh(A, B, C, D) \
+ (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \
+ _mm_setzero_ph (), \
+ (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_roundscale_round_sh(A, B, C, D) \
+ (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (D)))
+
+#define _mm_mask_roundscale_round_sh(A, B, C, D, E, F) \
+ (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), (F)))
+
+#define _mm_maskz_roundscale_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \
+ _mm_setzero_ph (), \
+ (A), (E)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclasssh. */
+#ifdef __OPTIMIZE__
+/* Forwards __A (as __v8hf) and the immediate __imm to
+   __builtin_ia32_fpclasssh_mask with the mask (__mmask8) -1 and returns
+   the result as __mmask8.  */
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sh_mask (__m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sh_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm, __U);
+}
+
+#else
+/* Macro form used when __OPTIMIZE__ is not defined: forwards X and the
+   immediate C to __builtin_ia32_fpclasssh_mask with the mask
+   (__mmask8) -1.  The last line deliberately carries no line
+   continuation so the definition ends here.  */
+#define _mm_fpclass_sh_mask(X, C) \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X), \
+                                             (int) (C), (__mmask8) (-1)))
+
+#define _mm_mask_fpclass_sh_mask(U, X, C) \
+ ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X), \
+ (int) (C), (__mmask8) (U)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclassph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_ph_mask (__mmask32 __U, __m512h __A,
+ const int __imm)
+{
+ return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_ph_mask (__m512h __A, const int __imm)
+{
+ return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+ __imm,
+ (__mmask32) -1);
+}
+
+#else
+/* Macro form used when __OPTIMIZE__ is not defined: forwards x and the
+   immediate c to __builtin_ia32_fpclassph512_mask.  The mask u is cast
+   to __mmask32, the same type the inline definition uses for its mask
+   parameter, so no mask bits are dropped.  */
+#define _mm512_mask_fpclass_ph_mask(u, x, c) \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+                                                 (int) (c), (__mmask32) (u)))
+
+/* Macro form used when __OPTIMIZE__ is not defined: forwards x and the
+   immediate c to __builtin_ia32_fpclassph512_mask with the mask
+   (__mmask32) -1, the same value the inline definition passes.  */
+#define _mm512_fpclass_ph_mask(x, c) \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+                                                 (int) (c), (__mmask32) -1))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vgetexpph, vgetexpsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_sh (__m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__v8hf) _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__v8hf) __W, (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_sh (__mmask8 __U, __m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__v8hf) _mm_setzero_ph (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_ph (__m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf) _mm512_setzero_ph (),
+ (__mmask32) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_ph (__m512h __W, __mmask32 __U, __m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_getexpph512_mask ((__v32hf) __A, (__v32hf) __W,
+ (__mmask32) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_ph (__mmask32 __U, __m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf) _mm512_setzero_ph (),
+ (__mmask32) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_sh (__m128h __A, __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_sh (__m128h __W, __mmask8 __U, __m128h __A,
+ __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_sh (__mmask8 __U, __m128h __A, __m128h __B,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_ph (__m512h __A, const int __R)
+{
+ return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf)
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_ph (__m512h __W, __mmask32 __U, __m512h __A,
+ const int __R)
+{
+ return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf) __W,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_ph (__mmask32 __U, __m512h __A, const int __R)
+{
+ return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf)
+ _mm512_setzero_ph (),
+ (__mmask32) __U, __R);
+}
+
+#else
+/* Macro form used when __OPTIMIZE__ is not defined: forwards A and B
+   (as __v8hf) to __builtin_ia32_getexpsh_mask_round with a zero vector
+   from _mm_setzero_ph (), the mask (__mmask8) -1 and rounding
+   argument R.  */
+#define _mm_getexp_round_sh(A, B, R) \
+  ((__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) (__m128h) (A), \
+                                                 (__v8hf) (__m128h) (B), \
+                                                 (__v8hf) _mm_setzero_ph (), \
+                                                 (__mmask8) -1, (R)))
+
+/* Macro form used when __OPTIMIZE__ is not defined: forwards A, B and W
+   (as __v8hf), the mask U and rounding argument C to
+   __builtin_ia32_getexpsh_mask_round.  The whole expansion and every
+   argument are parenthesized so the macro can be used inside larger
+   expressions; the casts mirror the inline definition.  */
+#define _mm_mask_getexp_round_sh(W, U, A, B, C) \
+  ((__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) (__m128h) (A), \
+                                                 (__v8hf) (__m128h) (B), \
+                                                 (__v8hf) (__m128h) (W), \
+                                                 (__mmask8) (U), (C)))
+
+/* Macro form used when __OPTIMIZE__ is not defined: forwards A and B
+   (as __v8hf), a zero vector from _mm_setzero_ph (), the mask U and
+   rounding argument C to __builtin_ia32_getexpsh_mask_round.  The whole
+   expansion and every argument are parenthesized so the macro can be
+   used inside larger expressions; the casts mirror the inline
+   definition.  */
+#define _mm_maskz_getexp_round_sh(U, A, B, C) \
+  ((__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) (__m128h) (A), \
+                                                 (__v8hf) (__m128h) (B), \
+                                                 (__v8hf) _mm_setzero_ph (), \
+                                                 (__mmask8) (U), (C)))
+
+/* Macro form used when __OPTIMIZE__ is not defined: forwards A (as
+   __v32hf) to __builtin_ia32_getexpph512_mask with a zero vector from
+   _mm512_setzero_ph (), the mask (__mmask32) -1 and rounding
+   argument R.  */
+#define _mm512_getexp_round_ph(A, R) \
+  ((__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) (__m512h) (A), \
+                                              (__v32hf) _mm512_setzero_ph (), \
+                                              (__mmask32) -1, (R)))
+
+/* Macro form used when __OPTIMIZE__ is not defined: forwards A and W
+   (as __v32hf), the mask U and rounding argument R to
+   __builtin_ia32_getexpph512_mask.  */
+#define _mm512_mask_getexp_round_ph(W, U, A, R) \
+  ((__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) (__m512h) (A), \
+                                              (__v32hf) (__m512h) (W), \
+                                              (__mmask32) (U), (R)))
+
+/* Macro form used when __OPTIMIZE__ is not defined: forwards A (as
+   __v32hf), a zero vector from _mm512_setzero_ph (), the mask U and
+   rounding argument R to __builtin_ia32_getexpph512_mask.  */
+#define _mm512_maskz_getexp_round_ph(U, A, R) \
+  ((__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) (__m512h) (A), \
+                                              (__v32hf) _mm512_setzero_ph (), \
+                                              (__mmask32) (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vgetmantph, vgetmantsh. */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_sh (__m128h __A, __m128h __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128h)
+ __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__D << 2) | __C, _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_sh (__m128h __W, __mmask8 __U, __m128h __A,
+ __m128h __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128h)
+ __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__D << 2) | __C, (__v8hf) __W,
+ __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_sh (__mmask8 __U, __m128h __A, __m128h __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128h)
+ __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__D << 2) | __C,
+ (__v8hf) _mm_setzero_ph(),
+ __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_ph (__m512h __W, __mmask32 __U, __m512h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ (__v32hf) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_ph (__mmask32 __U, __m512h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ (__v32hf)
+ _mm512_setzero_ph (),
+ __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_sh (__m128h __A, __m128h __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__D << 2) | __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_sh (__m128h __W, __mmask8 __U, __m128h __A,
+ __m128h __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__D << 2) | __C,
+ (__v8hf) __W,
+ __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_sh (__mmask8 __U, __m128h __A, __m128h __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__D << 2) | __C,
+ (__v8hf)
+ _mm_setzero_ph(),
+ __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_ph (__m512h __W, __mmask32 __U, __m512h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ (__v32hf) __W, __U,
+ __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ (__v32hf)
+ _mm512_setzero_ph (),
+ __U, __R);
+}
+
+#else
+#define _mm512_getmant_ph(X, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h) \
+ _mm512_setzero_ph(), \
+ (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ph(W, U, X, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+
+#define _mm512_maskz_getmant_ph(U, X, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h) \
+ _mm512_setzero_ph(), \
+ (__mmask32)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getmant_sh(X, Y, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h) \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_sh(W, U, X, Y, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_sh(U, X, Y, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h) \
+ _mm_setzero_ph(), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getmant_round_ph(X, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h) \
+ _mm512_setzero_ph(), \
+ (__mmask32)-1, \
+ (R)))
+
+#define _mm512_mask_getmant_round_ph(W, U, X, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), \
+ (R)))
+
+
+#define _mm512_maskz_getmant_round_ph(U, X, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h) \
+ _mm512_setzero_ph(), \
+ (__mmask32)(U), \
+ (R)))
+
+#define _mm_getmant_round_sh(X, Y, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h) \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ (R)))
+
+#define _mm_mask_getmant_round_sh(W, U, X, Y, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), \
+ (R)))
+
+#define _mm_maskz_getmant_round_sh(U, X, Y, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h) \
+ _mm_setzero_ph(), \
+ (__mmask8)(U), \
+ (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vmovw. */
+/* Returns the result of _mm_set_epi16 called with __A as its last
+   argument and zero for the other seven arguments.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi16_si128 (short __A)
+{
+  const __m128i __res = _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A);
+  return __res;
+}
+
+/* Reinterprets __A as __v8hi and returns element 0 as extracted by
+   __builtin_ia32_vec_ext_v8hi.  */
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si16 (__m128i __A)
+{
+  const __v8hi __words = (__v8hi) __A;
+  return __builtin_ia32_vec_ext_v8hi (__words, 0);
+}
+
+/* Intrinsics vmovsh. */
+/* Forwards the pointer __C, the vector __A and the mask __B (in that
+   order) to __builtin_ia32_loadsh_mask and returns its result.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C)
+{
+  const __m128h __res = __builtin_ia32_loadsh_mask (__C, __A, __B);
+  return __res;
+}
+
+/* Forwards the pointer __B, a zero vector from _mm_setzero_ph () and
+   the mask __A (in that order) to __builtin_ia32_loadsh_mask and
+   returns its result.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_loadsh_mask (__B, __zero, __A);
+}
+
+extern __inline void /* Forwards the pointer __A, the vector __C and the
+   mask __B (in that order) to __builtin_ia32_storesh_mask.
+   NOTE(review): __A is declared pointer-to-const although the name
+   suggests a store through it -- presumably the builtin writes to *__A;
+   confirm whether the const qualifier is intended.  */
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C)
+{
+ __builtin_ia32_storesh_mask (__A, __C, __B);
+}
+
+/* Returns a copy of __A whose element 0 has been replaced by element 0
+   of __B.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sh (__m128h __A, __m128h __B)
+{
+  __m128h __res = __A;
+  __res[0] = __B[0];
+  return __res;
+}
+
+/* Forwards __C, __D, the vector __A and the mask __B (in that order) to
+   __builtin_ia32_vmovsh_mask and returns its result.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  const __m128h __res = __builtin_ia32_vmovsh_mask (__C, __D, __A, __B);
+  return __res;
+}
+
+/* Forwards __B, __C, a zero vector from _mm_setzero_ph () and the mask
+   __A (in that order) to __builtin_ia32_vmovsh_mask and returns its
+   result.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vmovsh_mask (__B, __C, __zero, __A);
+}
+
+/* Intrinsics vcvtph2dq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi32 (__m256h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi32 (__mmask16 __A, __m256h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi32 (__m256h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi32 (__m512i __A, __mmask16 __B, __m256h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epi32(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2dq512_mask_round ((A), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundph_epi32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2dq512_mask_round ((C), (__v16si)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epi32(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2dq512_mask_round ((B), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2udq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu32 (__m256h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu32 (__mmask16 __A, __m256h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu32 (__m256h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu32 (__m512i __A, __mmask16 __B, __m256h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epu32(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2udq512_mask_round ((A), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundph_epu32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2udq512_mask_round ((C), (__v16si)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu32(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2udq512_mask_round ((B), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2dq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi32 (__m256h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi32 (__mmask16 __A, __m256h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi32 (__m256h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi32 (__m512i __A, __mmask16 __B,
+ __m256h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epi32(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2dq512_mask_round ((A), \
+ (__v16si) \
+ (_mm512_setzero_si512 ()), \
+ (__mmask16)(-1), (B)))
+
+#define _mm512_mask_cvtt_roundph_epi32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2dq512_mask_round ((C), \
+ (__v16si)(A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvtt_roundph_epi32(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2dq512_mask_round ((B), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2udq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu32 (__m256h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu32 (__mmask16 __A, __m256h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu32 (__m256h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu32 (__m512i __A, __mmask16 __B,
+ __m256h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epu32(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2udq512_mask_round ((A), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvtt_roundph_epu32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2udq512_mask_round ((C), \
+ (__v16si)(A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvtt_roundph_epu32(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2udq512_mask_round ((B), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtdq2ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_ph (__m512i __A)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_ph (__m256h __A, __mmask16 __B, __m512i __C)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C,
+ __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_ph (__mmask16 __A, __m512i __B)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B,
+ _mm256_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtdq2ph wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi32_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round
+    ((__v16si) __A, _mm256_setzero_ph (), (__mmask16) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi32_ph (__m256h __A, __mmask16 __B, __m512i __C,
+                               int __D)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round
+    ((__v16si) __C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi32_ph (__mmask16 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round
+    ((__v16si) __B, _mm256_setzero_ph (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtdq2ph wrappers (the #else branch
+   of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundepi32_ph(A, B) \
+  (__builtin_ia32_vcvtdq2ph512_mask_round \
+     ((__v16si)(A), _mm256_setzero_ph (), (__mmask16) -1, (B)))
+
+#define _mm512_mask_cvt_roundepi32_ph(A, B, C, D) \
+  (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepi32_ph(A, B, C) \
+  (__builtin_ia32_vcvtdq2ph512_mask_round \
+     ((__v16si)(B), _mm256_setzero_ph (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtudq2ph.
+
+   Wrappers over __builtin_ia32_vcvtudq2ph512_mask_round.  The builtin is
+   given, in order: the __m512i source viewed as __v16si, a __m256h second
+   operand, a __mmask16 and a rounding selector, which for the three forms
+   below is always _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round
+    ((__v16si) __A, _mm256_setzero_ph (), (__mmask16) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ph (__m256h __A, __mmask16 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round
+    ((__v16si) __C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ph (__mmask16 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round
+    ((__v16si) __B, _mm256_setzero_ph (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtudq2ph wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu32_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round
+    ((__v16si) __A, _mm256_setzero_ph (), (__mmask16) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu32_ph (__m256h __A, __mmask16 __B, __m512i __C,
+                               int __D)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round
+    ((__v16si) __C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu32_ph (__mmask16 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round
+    ((__v16si) __B, _mm256_setzero_ph (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtudq2ph wrappers (the #else
+   branch of the __OPTIMIZE__ conditional).
+
+   Every macro parameter is parenthesized where it is used, so an argument
+   that is itself an expression (e.g. `c ? x : y') is cast and forwarded as
+   one operand instead of having the (__v16si) cast bind to its first
+   token only.  */
+#define _mm512_cvt_roundepu32_ph(A, B) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round \
+     ((__v16si)(A), _mm256_setzero_ph (), (__mmask16) -1, (B)))
+
+#define _mm512_mask_cvt_roundepu32_ph(A, B, C, D) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepu32_ph(A, B, C) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round \
+     ((__v16si)(B), _mm256_setzero_ph (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2qq.
+
+   Wrappers over __builtin_ia32_vcvtph2qq512_mask_round.  The builtin is
+   given, in order: the __m128h source, a __m512i second operand, a __mmask8
+   and a rounding selector, which for the three forms below is always
+   _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi64 (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round
+    (__A, _mm512_setzero_si512 (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round
+    (__C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round
+    (__B, _mm512_setzero_si512 (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtph2qq wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi64 (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round
+    (__A, _mm512_setzero_si512 (), (__mmask8) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C,
+                               int __D)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round
+    (__B, _mm512_setzero_si512 (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtph2qq wrappers (the #else branch
+   of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundph_epi64(A, B) \
+  (__builtin_ia32_vcvtph2qq512_mask_round \
+     ((A), _mm512_setzero_si512 (), (__mmask8) -1, (B)))
+
+#define _mm512_mask_cvt_roundph_epi64(A, B, C, D) \
+  (__builtin_ia32_vcvtph2qq512_mask_round \
+     ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epi64(A, B, C) \
+  (__builtin_ia32_vcvtph2qq512_mask_round \
+     ((B), _mm512_setzero_si512 (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2uqq.
+
+   Wrappers over __builtin_ia32_vcvtph2uqq512_mask_round.  The builtin is
+   given, in order: the __m128h source, a __m512i second operand, a __mmask8
+   and a rounding selector, which for the three forms below is always
+   _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu64 (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round
+    (__A, _mm512_setzero_si512 (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round
+    (__C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round
+    (__B, _mm512_setzero_si512 (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+
+/* Explicit-rounding vcvtph2uqq wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu64 (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round
+    (__A, _mm512_setzero_si512 (), (__mmask8) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C,
+                               int __D)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round
+    (__B, _mm512_setzero_si512 (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtph2uqq wrappers (the #else
+   branch of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundph_epu64(A, B) \
+  (__builtin_ia32_vcvtph2uqq512_mask_round \
+     ((A), _mm512_setzero_si512 (), (__mmask8) -1, (B)))
+
+#define _mm512_mask_cvt_roundph_epu64(A, B, C, D) \
+  (__builtin_ia32_vcvtph2uqq512_mask_round \
+     ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu64(A, B, C) \
+  (__builtin_ia32_vcvtph2uqq512_mask_round \
+     ((B), _mm512_setzero_si512 (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2qq.
+
+   Wrappers over __builtin_ia32_vcvttph2qq512_mask_round.  The builtin is
+   given, in order: the __m128h source, a __m512i second operand, a __mmask8
+   and a rounding selector, which for the three forms below is always
+   _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi64 (__m128h __A)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round
+    (__A, _mm512_setzero_si512 (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round
+    (__C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round
+    (__B, _mm512_setzero_si512 (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-argument vcvttph2qq wrappers: the trailing int parameter is
+   forwarded to the builtin as its last operand.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, last operand __B.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi64 (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round
+    (__A, _mm512_setzero_si512 (), (__mmask8) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, last operand __D.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C,
+                                int __D)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, last operand __C.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round
+    (__B, _mm512_setzero_si512 (), __A, __C);
+}
+
+#else
+/* Macro forms of the vcvttph2qq *_round* wrappers (the #else branch of the
+   __OPTIMIZE__ conditional).  All three expansions are wrapped in an outer
+   pair of parentheses so that each macro behaves as a single primary
+   expression, like the other conversion macros in this header.  */
+#define _mm512_cvtt_roundph_epi64(A, B) \
+  (__builtin_ia32_vcvttph2qq512_mask_round \
+     ((A), _mm512_setzero_si512 (), (__mmask8) -1, (B)))
+
+#define _mm512_mask_cvtt_roundph_epi64(A, B, C, D) \
+  (__builtin_ia32_vcvttph2qq512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvtt_roundph_epi64(A, B, C) \
+  (__builtin_ia32_vcvttph2qq512_mask_round \
+     ((B), _mm512_setzero_si512 (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2uqq.
+
+   Wrappers over __builtin_ia32_vcvttph2uqq512_mask_round.  The builtin is
+   given, in order: the __m128h source, a __m512i second operand, a __mmask8
+   and a rounding selector, which for the three forms below is always
+   _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu64 (__m128h __A)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round
+    (__A, _mm512_setzero_si512 (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round
+    (__C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round
+    (__B, _mm512_setzero_si512 (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-argument vcvttph2uqq wrappers: the trailing int parameter is
+   forwarded to the builtin as its last operand.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, last operand __B.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu64 (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round
+    (__A, _mm512_setzero_si512 (), (__mmask8) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, last operand __D.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C,
+                                int __D)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, last operand __C.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round
+    (__B, _mm512_setzero_si512 (), __A, __C);
+}
+
+#else
+/* Macro forms of the vcvttph2uqq *_round* wrappers (the #else branch of the
+   __OPTIMIZE__ conditional).  All three expansions are wrapped in an outer
+   pair of parentheses so that each macro behaves as a single primary
+   expression, like the other conversion macros in this header.  */
+#define _mm512_cvtt_roundph_epu64(A, B) \
+  (__builtin_ia32_vcvttph2uqq512_mask_round \
+     ((A), _mm512_setzero_si512 (), (__mmask8) -1, (B)))
+
+#define _mm512_mask_cvtt_roundph_epu64(A, B, C, D) \
+  (__builtin_ia32_vcvttph2uqq512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvtt_roundph_epu64(A, B, C) \
+  (__builtin_ia32_vcvttph2uqq512_mask_round \
+     ((B), _mm512_setzero_si512 (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtqq2ph.
+
+   Wrappers over __builtin_ia32_vcvtqq2ph512_mask_round.  The builtin is
+   given, in order: the __m512i source viewed as __v8di, a __m128h second
+   operand, a __mmask8 and a rounding selector, which for the three forms
+   below is always _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round
+    ((__v8di) __A, _mm_setzero_ph (), (__mmask8) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round
+    ((__v8di) __C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_ph (__mmask8 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round
+    ((__v8di) __B, _mm_setzero_ph (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtqq2ph wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi64_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round
+    ((__v8di) __A, _mm_setzero_ph (), (__mmask8) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi64_ph (__m128h __A, __mmask8 __B, __m512i __C,
+                               int __D)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round
+    ((__v8di) __C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi64_ph (__mmask8 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round
+    ((__v8di) __B, _mm_setzero_ph (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtqq2ph wrappers (the #else branch
+   of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundepi64_ph(A, B) \
+  (__builtin_ia32_vcvtqq2ph512_mask_round \
+     ((__v8di)(A), _mm_setzero_ph (), (__mmask8) -1, (B)))
+
+#define _mm512_mask_cvt_roundepi64_ph(A, B, C, D) \
+  (__builtin_ia32_vcvtqq2ph512_mask_round \
+     ((__v8di)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepi64_ph(A, B, C) \
+  (__builtin_ia32_vcvtqq2ph512_mask_round \
+     ((__v8di)(B), _mm_setzero_ph (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtuqq2ph.
+
+   Wrappers over __builtin_ia32_vcvtuqq2ph512_mask_round.  The builtin is
+   given, in order: the __m512i source viewed as __v8di, a __m128h second
+   operand, a __mmask8 and a rounding selector, which for the three forms
+   below is always _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round
+    ((__v8di) __A, _mm_setzero_ph (), (__mmask8) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round
+    ((__v8di) __C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_ph (__mmask8 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round
+    ((__v8di) __B, _mm_setzero_ph (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtuqq2ph wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu64_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round
+    ((__v8di) __A, _mm_setzero_ph (), (__mmask8) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu64_ph (__m128h __A, __mmask8 __B, __m512i __C,
+                               int __D)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round
+    ((__v8di) __C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu64_ph (__mmask8 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round
+    ((__v8di) __B, _mm_setzero_ph (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtuqq2ph wrappers (the #else
+   branch of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundepu64_ph(A, B) \
+  (__builtin_ia32_vcvtuqq2ph512_mask_round \
+     ((__v8di)(A), _mm_setzero_ph (), (__mmask8) -1, (B)))
+
+#define _mm512_mask_cvt_roundepu64_ph(A, B, C, D) \
+  (__builtin_ia32_vcvtuqq2ph512_mask_round \
+     ((__v8di)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepu64_ph(A, B, C) \
+  (__builtin_ia32_vcvtuqq2ph512_mask_round \
+     ((__v8di)(B), _mm_setzero_ph (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2w.
+
+   Wrappers over __builtin_ia32_vcvtph2w512_mask_round.  The builtin is
+   given, in order: the __m512h source, a second operand viewed as __v32hi,
+   a __mmask32 and a rounding selector, which for the three forms below is
+   always _MM_FROUND_CUR_DIRECTION.  Its result is cast to __m512i.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi16 (__m512h __A)
+{
+  return (__m512i) __builtin_ia32_vcvtph2w512_mask_round
+    (__A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+  return (__m512i) __builtin_ia32_vcvtph2w512_mask_round
+    (__C, (__v32hi) __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi16 (__mmask32 __A, __m512h __B)
+{
+  return (__m512i) __builtin_ia32_vcvtph2w512_mask_round
+    (__B, (__v32hi) _mm512_setzero_si512 (), __A,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtph2w wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi16 (__m512h __A, int __B)
+{
+  return (__m512i) __builtin_ia32_vcvtph2w512_mask_round
+    (__A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi16 (__m512i __A, __mmask32 __B, __m512h __C,
+                               int __D)
+{
+  return (__m512i) __builtin_ia32_vcvtph2w512_mask_round
+    (__C, (__v32hi) __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vcvtph2w512_mask_round
+    (__B, (__v32hi) _mm512_setzero_si512 (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtph2w wrappers (the #else branch
+   of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundph_epi16(A, B) \
+  ((__m512i) __builtin_ia32_vcvtph2w512_mask_round \
+     ((A), (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1, (B)))
+
+#define _mm512_mask_cvt_roundph_epi16(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vcvtph2w512_mask_round \
+     ((C), (__v32hi)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epi16(A, B, C) \
+  ((__m512i) __builtin_ia32_vcvtph2w512_mask_round \
+     ((B), (__v32hi) _mm512_setzero_si512 (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2uw.
+
+   Wrappers over __builtin_ia32_vcvtph2uw512_mask_round.  The builtin is
+   given, in order: the __m512h source, a second operand viewed as __v32hi,
+   a __mmask32 and a rounding selector, which for the three forms below is
+   always _MM_FROUND_CUR_DIRECTION.  Its result is cast to __m512i.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu16 (__m512h __A)
+{
+  return (__m512i) __builtin_ia32_vcvtph2uw512_mask_round
+    (__A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+  return (__m512i) __builtin_ia32_vcvtph2uw512_mask_round
+    (__C, (__v32hi) __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu16 (__mmask32 __A, __m512h __B)
+{
+  return (__m512i) __builtin_ia32_vcvtph2uw512_mask_round
+    (__B, (__v32hi) _mm512_setzero_si512 (), __A,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtph2uw wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu16 (__m512h __A, int __B)
+{
+  return (__m512i) __builtin_ia32_vcvtph2uw512_mask_round
+    (__A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu16 (__m512i __A, __mmask32 __B, __m512h __C,
+                               int __D)
+{
+  return (__m512i) __builtin_ia32_vcvtph2uw512_mask_round
+    (__C, (__v32hi) __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vcvtph2uw512_mask_round
+    (__B, (__v32hi) _mm512_setzero_si512 (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtph2uw wrappers (the #else branch
+   of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundph_epu16(A, B) \
+  ((__m512i) __builtin_ia32_vcvtph2uw512_mask_round \
+     ((A), (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1, (B)))
+
+#define _mm512_mask_cvt_roundph_epu16(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vcvtph2uw512_mask_round \
+     ((C), (__v32hi)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu16(A, B, C) \
+  ((__m512i) __builtin_ia32_vcvtph2uw512_mask_round \
+     ((B), (__v32hi) _mm512_setzero_si512 (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2w.
+
+   Wrappers over __builtin_ia32_vcvttph2w512_mask_round.  The builtin is
+   given, in order: the __m512h source, a second operand viewed as __v32hi,
+   a __mmask32 and a rounding selector, which for the three forms below is
+   always _MM_FROUND_CUR_DIRECTION.  Its result is cast to __m512i.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi16 (__m512h __A)
+{
+  return (__m512i) __builtin_ia32_vcvttph2w512_mask_round
+    (__A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+  return (__m512i) __builtin_ia32_vcvttph2w512_mask_round
+    (__C, (__v32hi) __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi16 (__mmask32 __A, __m512h __B)
+{
+  return (__m512i) __builtin_ia32_vcvttph2w512_mask_round
+    (__B, (__v32hi) _mm512_setzero_si512 (), __A,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-argument vcvttph2w wrappers: the trailing int parameter is
+   forwarded to the builtin as its last operand.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, last operand __B.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi16 (__m512h __A, int __B)
+{
+  return (__m512i) __builtin_ia32_vcvttph2w512_mask_round
+    (__A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, last operand __D.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi16 (__m512i __A, __mmask32 __B, __m512h __C,
+                                int __D)
+{
+  return (__m512i) __builtin_ia32_vcvttph2w512_mask_round
+    (__C, (__v32hi) __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, last operand __C.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vcvttph2w512_mask_round
+    (__B, (__v32hi) _mm512_setzero_si512 (), __A, __C);
+}
+
+#else
+/* Macro forms of the vcvttph2w *_round* wrappers (the #else branch of the
+   __OPTIMIZE__ conditional).  */
+#define _mm512_cvtt_roundph_epi16(A, B) \
+  ((__m512i) __builtin_ia32_vcvttph2w512_mask_round \
+     ((A), (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1, (B)))
+
+#define _mm512_mask_cvtt_roundph_epi16(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vcvttph2w512_mask_round \
+     ((C), (__v32hi)(A), (B), (D)))
+
+#define _mm512_maskz_cvtt_roundph_epi16(A, B, C) \
+  ((__m512i) __builtin_ia32_vcvttph2w512_mask_round \
+     ((B), (__v32hi) _mm512_setzero_si512 (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2uw.
+
+   Wrappers over __builtin_ia32_vcvttph2uw512_mask_round.  The builtin is
+   given, in order: the __m512h source, a second operand viewed as __v32hi,
+   a __mmask32 and a rounding selector, which for the three forms below is
+   always _MM_FROUND_CUR_DIRECTION.  Its result is cast to __m512i.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu16 (__m512h __A)
+{
+  return (__m512i) __builtin_ia32_vcvttph2uw512_mask_round
+    (__A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+  return (__m512i) __builtin_ia32_vcvttph2uw512_mask_round
+    (__C, (__v32hi) __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu16 (__mmask32 __A, __m512h __B)
+{
+  return (__m512i) __builtin_ia32_vcvttph2uw512_mask_round
+    (__B, (__v32hi) _mm512_setzero_si512 (), __A,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-argument vcvttph2uw wrappers: the trailing int parameter is
+   forwarded to the builtin as its last operand.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, last operand __B.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu16 (__m512h __A, int __B)
+{
+  return (__m512i) __builtin_ia32_vcvttph2uw512_mask_round
+    (__A, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, last operand __D.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu16 (__m512i __A, __mmask32 __B, __m512h __C,
+                                int __D)
+{
+  return (__m512i) __builtin_ia32_vcvttph2uw512_mask_round
+    (__C, (__v32hi) __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, last operand __C.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vcvttph2uw512_mask_round
+    (__B, (__v32hi) _mm512_setzero_si512 (), __A, __C);
+}
+
+#else
+/* Macro forms of the vcvttph2uw *_round* wrappers (the #else branch of the
+   __OPTIMIZE__ conditional).  */
+#define _mm512_cvtt_roundph_epu16(A, B) \
+  ((__m512i) __builtin_ia32_vcvttph2uw512_mask_round \
+     ((A), (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1, (B)))
+
+#define _mm512_mask_cvtt_roundph_epu16(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vcvttph2uw512_mask_round \
+     ((C), (__v32hi)(A), (B), (D)))
+
+#define _mm512_maskz_cvtt_roundph_epu16(A, B, C) \
+  ((__m512i) __builtin_ia32_vcvttph2uw512_mask_round \
+     ((B), (__v32hi) _mm512_setzero_si512 (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtw2ph.
+
+   Wrappers over __builtin_ia32_vcvtw2ph512_mask_round.  The builtin is
+   given, in order: the __m512i source viewed as __v32hi, a __m512h second
+   operand, a __mmask32 and a rounding selector, which for the three forms
+   below is always _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round
+    ((__v32hi) __A, _mm512_setzero_ph (), (__mmask32) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_ph (__m512h __A, __mmask32 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round
+    ((__v32hi) __C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_ph (__mmask32 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round
+    ((__v32hi) __B, _mm512_setzero_ph (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtw2ph wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi16_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round
+    ((__v32hi) __A, _mm512_setzero_ph (), (__mmask32) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi16_ph (__m512h __A, __mmask32 __B, __m512i __C,
+                               int __D)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round
+    ((__v32hi) __C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi16_ph (__mmask32 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round
+    ((__v32hi) __B, _mm512_setzero_ph (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtw2ph wrappers (the #else branch
+   of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundepi16_ph(A, B) \
+  (__builtin_ia32_vcvtw2ph512_mask_round \
+     ((__v32hi)(A), _mm512_setzero_ph (), (__mmask32) -1, (B)))
+
+#define _mm512_mask_cvt_roundepi16_ph(A, B, C, D) \
+  (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepi16_ph(A, B, C) \
+  (__builtin_ia32_vcvtw2ph512_mask_round \
+     ((__v32hi)(B), _mm512_setzero_ph (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtuw2ph.
+
+   Wrappers over __builtin_ia32_vcvtuw2ph512_mask_round.  The builtin is
+   given, in order: the __m512i source viewed as __v32hi, a __m512h second
+   operand, a __mmask32 and a rounding selector, which for the three forms
+   below is always _MM_FROUND_CUR_DIRECTION.  */
+
+/* Unmasked: zeroed second operand, all-ones mask.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round
+    ((__v32hi) __A, _mm512_setzero_ph (), (__mmask32) -1,
+     _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked: __C is the source, __A the second operand, __B the mask.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_ph (__m512h __A, __mmask32 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round
+    ((__v32hi) __C, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Maskz: __B is the source, the second operand is zeroed, __A the mask.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_ph (__mmask32 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round
+    ((__v32hi) __B, _mm512_setzero_ph (), __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Explicit-rounding vcvtuw2ph wrappers: the trailing int parameter is
+   forwarded to the builtin as the rounding selector.  */
+
+/* Unmasked: zeroed second operand, all-ones mask, rounding __B.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu16_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round
+    ((__v32hi) __A, _mm512_setzero_ph (), (__mmask32) -1, __B);
+}
+
+/* Masked: source __C, second operand __A, mask __B, rounding __D.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu16_ph (__m512h __A, __mmask32 __B, __m512i __C,
+                               int __D)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round
+    ((__v32hi) __C, __A, __B, __D);
+}
+
+/* Maskz: source __B, zeroed second operand, mask __A, rounding __C.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu16_ph (__mmask32 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round
+    ((__v32hi) __B, _mm512_setzero_ph (), __A, __C);
+}
+
+#else
+/* Macro forms of the explicit-rounding vcvtuw2ph wrappers (the #else branch
+   of the __OPTIMIZE__ conditional).  */
+#define _mm512_cvt_roundepu16_ph(A, B) \
+  (__builtin_ia32_vcvtuw2ph512_mask_round \
+     ((__v32hi)(A), _mm512_setzero_ph (), (__mmask32) -1, (B)))
+
+#define _mm512_mask_cvt_roundepu16_ph(A, B, C, D) \
+  (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepu16_ph(A, B, C) \
+  (__builtin_ia32_vcvtuw2ph512_mask_round \
+     ((__v32hi)(B), _mm512_setzero_ph (), (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtsh2si, vcvtsh2usi.  */
+
+/* Passes __A and _MM_FROUND_CUR_DIRECTION to
+   __builtin_ia32_vcvtsh2si32_round and returns the result as int.  */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_i32 (__m128h __A)
+{
+  return (int) __builtin_ia32_vcvtsh2si32_round (__A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Passes __A and _MM_FROUND_CUR_DIRECTION to
+   __builtin_ia32_vcvtsh2usi32_round.  The result is cast straight to the
+   declared return type, unsigned, instead of being routed through int.  */
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_u32 (__m128h __A)
+{
+  return (unsigned) __builtin_ia32_vcvtsh2usi32_round
+    (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Passes __A and the caller's rounding selector __R to
+   __builtin_ia32_vcvtsh2si32_round and returns the result as int.  */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_i32 (__m128h __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtsh2si32_round (__A, __R);
+}
+
+/* Passes __A and __R to __builtin_ia32_vcvtsh2usi32_round.  The result is
+   cast straight to the declared return type, unsigned, instead of being
+   routed through int.  */
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_u32 (__m128h __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvtsh2usi32_round (__A, __R);
+}
+
+#else
+/* Macro forms of _mm_cvt_roundsh_i32 / _mm_cvt_roundsh_u32 (the #else
+   branch of the __OPTIMIZE__ conditional).  */
+#define _mm_cvt_roundsh_i32(A, B) \
+  ((int) __builtin_ia32_vcvtsh2si32_round ((A), (B)))
+/* Cast to unsigned so the macro has the same type as the inline-function
+   form of _mm_cvt_roundsh_u32; with an (int) cast, widening the result
+   would sign-extend in this form only.  */
+#define _mm_cvt_roundsh_u32(A, B) \
+  ((unsigned) __builtin_ia32_vcvtsh2usi32_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+/* Passes __A and _MM_FROUND_CUR_DIRECTION to
+   __builtin_ia32_vcvtsh2si64_round and returns the result as long long.  */
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_i64 (__m128h __A)
+{
+  return (long long) __builtin_ia32_vcvtsh2si64_round
+    (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Passes __A and _MM_FROUND_CUR_DIRECTION to
+   __builtin_ia32_vcvtsh2usi64_round.  The result is cast straight to the
+   declared return type, unsigned long long, instead of being routed
+   through long long.  */
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_u64 (__m128h __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsh2usi64_round
+    (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Passes __A and the caller's rounding selector __R to
+   __builtin_ia32_vcvtsh2si64_round and returns the result as long long.  */
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_i64 (__m128h __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtsh2si64_round (__A, __R);
+}
+
+/* Passes __A and __R to __builtin_ia32_vcvtsh2usi64_round.  The result is
+   cast straight to the declared return type, unsigned long long, instead
+   of being routed through long long.  */
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_u64 (__m128h __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsh2usi64_round (__A, __R);
+}
+
+#else
+/* Macro forms of _mm_cvt_roundsh_i64 / _mm_cvt_roundsh_u64 (the #else
+   branch of the __OPTIMIZE__ conditional).  */
+#define _mm_cvt_roundsh_i64(A, B) \
+  ((long long) __builtin_ia32_vcvtsh2si64_round ((A), (B)))
+/* Cast to unsigned long long so the macro has the same type as the
+   inline-function form of _mm_cvt_roundsh_u64.  */
+#define _mm_cvt_roundsh_u64(A, B) \
+  ((unsigned long long) __builtin_ia32_vcvtsh2usi64_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
+
+/* Intrinsics vcvttsh2si, vcvttsh2us. */
+/* Return __builtin_ia32_vcvttsh2si32_round on __A with
+   _MM_FROUND_CUR_DIRECTION, as an int.  */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_i32 (__m128h __A)
+{
+ int __res
+ = (int) __builtin_ia32_vcvttsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Return __builtin_ia32_vcvttsh2usi32_round on __A with
+   _MM_FROUND_CUR_DIRECTION.  The cast names the declared unsigned return
+   type rather than detouring through int.  */
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_u32 (__m128h __A)
+{
+ return (unsigned)
+ __builtin_ia32_vcvttsh2usi32_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Return __builtin_ia32_vcvttsh2si32_round (__A, __R) as an int.  */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_i32 (__m128h __A, const int __R)
+{
+ int __res = (int) __builtin_ia32_vcvttsh2si32_round (__A, __R);
+ return __res;
+}
+
+/* Return __builtin_ia32_vcvttsh2usi32_round (__A, __R).  The cast names the
+   declared unsigned return type rather than detouring through int.  */
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_u32 (__m128h __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvttsh2usi32_round (__A, __R);
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined.  Each cast matches the
+   return type of the inline function of the same name (int / unsigned).  */
+#define _mm_cvtt_roundsh_i32(A, B) \
+ ((int)__builtin_ia32_vcvttsh2si32_round ((A), (B)))
+#define _mm_cvtt_roundsh_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvttsh2usi32_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+/* Return __builtin_ia32_vcvttsh2si64_round on __A with
+   _MM_FROUND_CUR_DIRECTION, as a long long.  */
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_i64 (__m128h __A)
+{
+ long long __res
+ = (long long) __builtin_ia32_vcvttsh2si64_round (__A,
+ _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Return __builtin_ia32_vcvttsh2usi64_round on __A with
+   _MM_FROUND_CUR_DIRECTION.  The cast names the declared unsigned return
+   type rather than detouring through long long.  */
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_u64 (__m128h __A)
+{
+ return (unsigned long long)
+ __builtin_ia32_vcvttsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Return __builtin_ia32_vcvttsh2si64_round (__A, __R) as a long long.  */
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_i64 (__m128h __A, const int __R)
+{
+ long long __res = (long long) __builtin_ia32_vcvttsh2si64_round (__A, __R);
+ return __res;
+}
+
+/* Return __builtin_ia32_vcvttsh2usi64_round (__A, __R).  The cast names the
+   declared unsigned return type rather than detouring through long long.  */
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_u64 (__m128h __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvttsh2usi64_round (__A, __R);
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined.  Each cast matches the
+   return type of the inline function of the same name (long long /
+   unsigned long long).  */
+#define _mm_cvtt_roundsh_i64(A, B) \
+ ((long long)__builtin_ia32_vcvttsh2si64_round ((A), (B)))
+#define _mm_cvtt_roundsh_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvttsh2usi64_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
+
+/* Intrinsics vcvtsi2sh, vcvtusi2sh. */
+/* Hand __A and the int __B to __builtin_ia32_vcvtsi2sh32_round together with
+   _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti32_sh (__m128h __A, int __B)
+{
+ __m128h __res
+ = __builtin_ia32_vcvtsi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Hand __A and the unsigned __B to __builtin_ia32_vcvtusi2sh32_round together
+   with _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_sh (__m128h __A, unsigned int __B)
+{
+ __m128h __res
+ = __builtin_ia32_vcvtusi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+#ifdef __OPTIMIZE__
+/* Forward __A, __B and __R unchanged to __builtin_ia32_vcvtsi2sh32_round.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi32_sh (__m128h __A, int __B, const int __R)
+{
+ __m128h __res = __builtin_ia32_vcvtsi2sh32_round (__A, __B, __R);
+ return __res;
+}
+
+/* Forward __A, __B and __R unchanged to __builtin_ia32_vcvtusi2sh32_round.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu32_sh (__m128h __A, unsigned int __B, const int __R)
+{
+ __m128h __res = __builtin_ia32_vcvtusi2sh32_round (__A, __B, __R);
+ return __res;
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined; the three arguments
+   are forwarded to the builtin in order.  */
+#define _mm_cvt_roundi32_sh(V, I, R) \
+ (__builtin_ia32_vcvtsi2sh32_round ((V), (I), (R)))
+#define _mm_cvt_roundu32_sh(V, U, R) \
+ (__builtin_ia32_vcvtusi2sh32_round ((V), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+/* Hand __A and the long long __B to __builtin_ia32_vcvtsi2sh64_round together
+   with _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti64_sh (__m128h __A, long long __B)
+{
+ __m128h __res
+ = __builtin_ia32_vcvtsi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Hand __A and the unsigned long long __B to
+   __builtin_ia32_vcvtusi2sh64_round with _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_sh (__m128h __A, unsigned long long __B)
+{
+ __m128h __res
+ = __builtin_ia32_vcvtusi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+#ifdef __OPTIMIZE__
+/* Forward __A, __B and __R unchanged to __builtin_ia32_vcvtsi2sh64_round.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_sh (__m128h __A, long long __B, const int __R)
+{
+ __m128h __res = __builtin_ia32_vcvtsi2sh64_round (__A, __B, __R);
+ return __res;
+}
+
+/* Forward __A, __B and __R unchanged to __builtin_ia32_vcvtusi2sh64_round.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_sh (__m128h __A, unsigned long long __B, const int __R)
+{
+ __m128h __res = __builtin_ia32_vcvtusi2sh64_round (__A, __B, __R);
+ return __res;
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined; the three arguments
+   are forwarded to the builtin in order.  */
+#define _mm_cvt_roundi64_sh(V, I, R) \
+ (__builtin_ia32_vcvtsi2sh64_round ((V), (I), (R)))
+#define _mm_cvt_roundu64_sh(V, U, R) \
+ (__builtin_ia32_vcvtusi2sh64_round ((V), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
+
+/* Intrinsics vcvtph2pd. */
+/* Unmasked form: the builtin receives __A, a zero vector, an all-ones
+   __mmask8 and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_pd (__m128h __A)
+{
+ __m512d __zero = _mm512_setzero_pd ();
+ return __builtin_ia32_vcvtph2pd512_mask_round (__A, __zero, (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: source __C goes first, followed by the caller's vector __A,
+   mask __B and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_pd (__m512d __A, __mmask8 __B, __m128h __C)
+{
+ __m512d __res
+ = __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives __B, a zero vector, mask __A and
+   _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+ __m512d __zero = _mm512_setzero_pd ();
+ return __builtin_ia32_vcvtph2pd512_mask_round (__B, __zero, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Unmasked form with caller-supplied last operand __B: the builtin receives
+   __A, a zero vector, an all-ones __mmask8 and __B.  */
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_pd (__m128h __A, int __B)
+{
+ __m512d __zero = _mm512_setzero_pd ();
+ return __builtin_ia32_vcvtph2pd512_mask_round (__A, __zero, (__mmask8) -1,
+ __B);
+}
+
+/* Masked form: arguments are reordered to (__C, __A, __B, __D) for the
+   builtin.  */
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_pd (__m512d __A, __mmask8 __B, __m128h __C, int __D)
+{
+ __m512d __res = __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, __D);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives __B, a zero vector, mask __A and
+   __C.  */
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_pd (__mmask8 __A, __m128h __B, int __C)
+{
+ __m512d __zero = _mm512_setzero_pd ();
+ return __builtin_ia32_vcvtph2pd512_mask_round (__B, __zero, __A, __C);
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined; operand order matches
+   the inline functions of the same names.  */
+#define _mm512_cvt_roundph_pd(H, R) \
+ (__builtin_ia32_vcvtph2pd512_mask_round ((H), _mm512_setzero_pd (), \
+ (__mmask8)-1, (R)))
+
+#define _mm512_mask_cvt_roundph_pd(W, U, H, R) \
+ (__builtin_ia32_vcvtph2pd512_mask_round ((H), (W), (U), (R)))
+
+#define _mm512_maskz_cvt_roundph_pd(U, H, R) \
+ (__builtin_ia32_vcvtph2pd512_mask_round ((H), _mm512_setzero_pd (), \
+ (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2psx. */
+/* Unmasked form: the builtin receives __A, a zero vector, an all-ones
+   __mmask16 and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtxph_ps (__m256h __A)
+{
+ __m512 __zero = _mm512_setzero_ps ();
+ return __builtin_ia32_vcvtph2psx512_mask_round (__A, __zero, (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: source __C goes first, followed by the caller's vector __A,
+   mask __B and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtxph_ps (__m512 __A, __mmask16 __B, __m256h __C)
+{
+ __m512 __res
+ = __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives __B, a zero vector, mask __A and
+   _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtxph_ps (__mmask16 __A, __m256h __B)
+{
+ __m512 __zero = _mm512_setzero_ps ();
+ return __builtin_ia32_vcvtph2psx512_mask_round (__B, __zero, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Unmasked form with caller-supplied last operand __B: the builtin receives
+   __A, a zero vector, an all-ones __mmask16 and __B.  */
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtx_roundph_ps (__m256h __A, int __B)
+{
+ __m512 __zero = _mm512_setzero_ps ();
+ return __builtin_ia32_vcvtph2psx512_mask_round (__A, __zero, (__mmask16) -1,
+ __B);
+}
+
+/* Masked form: arguments are reordered to (__C, __A, __B, __D) for the
+   builtin.  */
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtx_roundph_ps (__m512 __A, __mmask16 __B, __m256h __C, int __D)
+{
+ __m512 __res = __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, __D);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives __B, a zero vector, mask __A and
+   __C.  */
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtx_roundph_ps (__mmask16 __A, __m256h __B, int __C)
+{
+ __m512 __zero = _mm512_setzero_ps ();
+ return __builtin_ia32_vcvtph2psx512_mask_round (__B, __zero, __A, __C);
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined; operand order matches
+   the inline functions of the same names.  */
+#define _mm512_cvtx_roundph_ps(H, R) \
+ (__builtin_ia32_vcvtph2psx512_mask_round ((H), _mm512_setzero_ps (), \
+ (__mmask16)-1, (R)))
+
+#define _mm512_mask_cvtx_roundph_ps(W, U, H, R) \
+ (__builtin_ia32_vcvtph2psx512_mask_round ((H), (W), (U), (R)))
+
+#define _mm512_maskz_cvtx_roundph_ps(U, H, R) \
+ (__builtin_ia32_vcvtph2psx512_mask_round ((H), _mm512_setzero_ps (), \
+ (U), (R)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtps2ph. */
+/* Unmasked form: the builtin receives __A cast to __v16sf, a zero vector, an
+   all-ones __mmask16 and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtxps_ph (__m512 __A)
+{
+ __m256h __zero = _mm256_setzero_ph ();
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A, __zero,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: source __C (as __v16sf) goes first, followed by the caller's
+   vector __A, mask __B and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtxps_ph (__m256h __A, __mmask16 __B, __m512 __C)
+{
+ __v16sf __src = (__v16sf) __C;
+ return __builtin_ia32_vcvtps2phx512_mask_round (__src, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Zero-vector form: the builtin receives __B cast to __v16sf, a zero vector,
+   mask __A and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtxps_ph (__mmask16 __A, __m512 __B)
+{
+ __m256h __zero = _mm256_setzero_ph ();
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B, __zero, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Unmasked form with caller-supplied last operand __B: the builtin receives
+   __A cast to __v16sf, a zero vector, an all-ones __mmask16 and __B.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtx_roundps_ph (__m512 __A, int __B)
+{
+ __m256h __zero = _mm256_setzero_ph ();
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A, __zero,
+ (__mmask16) -1, __B);
+}
+
+/* Masked form: the builtin receives __C cast to __v16sf, then __A, __B and
+   __D.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtx_roundps_ph (__m256h __A, __mmask16 __B, __m512 __C, int __D)
+{
+ __v16sf __src = (__v16sf) __C;
+ return __builtin_ia32_vcvtps2phx512_mask_round (__src, __A, __B, __D);
+}
+
+/* Zero-vector form: the builtin receives __B cast to __v16sf, a zero vector,
+   mask __A and __C.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtx_roundps_ph (__mmask16 __A, __m512 __B, int __C)
+{
+ __m256h __zero = _mm256_setzero_ph ();
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B, __zero,
+ __A, __C);
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined; operand order matches
+   the inline functions of the same names.  */
+#define _mm512_cvtx_roundps_ph(S, R) \
+ (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(S), \
+ _mm256_setzero_ph (), \
+ (__mmask16)-1, (R)))
+
+#define _mm512_mask_cvtx_roundps_ph(W, U, S, R) \
+ (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(S), (W), (U), (R)))
+
+#define _mm512_maskz_cvtx_roundps_ph(U, S, R) \
+ (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(S), \
+ _mm256_setzero_ph (), \
+ (U), (R)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtpd2ph. */
+/* Unmasked form: the builtin receives __A cast to __v8df, a zero vector, an
+   all-ones __mmask8 and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_ph (__m512d __A)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A, __zero,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: source __C (as __v8df) goes first, followed by the caller's
+   vector __A, mask __B and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m512d __C)
+{
+ __v8df __src = (__v8df) __C;
+ return __builtin_ia32_vcvtpd2ph512_mask_round (__src, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Zero-vector form: the builtin receives __B cast to __v8df, a zero vector,
+   mask __A and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_ph (__mmask8 __A, __m512d __B)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B, __zero, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Unmasked form with caller-supplied last operand __B: the builtin receives
+   __A cast to __v8df, a zero vector, an all-ones __mmask8 and __B.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_ph (__m512d __A, int __B)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A, __zero,
+ (__mmask8) -1, __B);
+}
+
+/* Masked form: the builtin receives __C cast to __v8df, then __A, __B and
+   __D.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_ph (__m128h __A, __mmask8 __B, __m512d __C, int __D)
+{
+ __v8df __src = (__v8df) __C;
+ return __builtin_ia32_vcvtpd2ph512_mask_round (__src, __A, __B, __D);
+}
+
+/* Zero-vector form: the builtin receives __B cast to __v8df, a zero vector,
+   mask __A and __C.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_ph (__mmask8 __A, __m512d __B, int __C)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B, __zero,
+ __A, __C);
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined; operand order matches
+   the inline functions of the same names.  */
+#define _mm512_cvt_roundpd_ph(S, R) \
+ (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(S), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (R)))
+
+#define _mm512_mask_cvt_roundpd_ph(W, U, S, R) \
+ (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(S), (W), (U), (R)))
+
+#define _mm512_maskz_cvt_roundpd_ph(U, S, R) \
+ (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(S), \
+ _mm_setzero_ph (), \
+ (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtsh2ss, vcvtsh2sd. */
+/* Unmasked form: the builtin receives (__B, __A), a zero vector, an all-ones
+   __mmask8 and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_ss (__m128 __A, __m128h __B)
+{
+ __m128 __zero = _mm_setzero_ps ();
+ return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A, __zero, (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: arguments are reordered to (__D, __C, __A, __B) and followed
+   by _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsh_ss (__m128 __A, __mmask8 __B, __m128 __C, __m128h __D)
+{
+ __m128 __res
+ = __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives (__C, __B), a zero vector, mask __A
+   and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsh_ss (__mmask8 __A, __m128 __B, __m128h __C)
+{
+ __m128 __zero = _mm_setzero_ps ();
+ return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B, __zero, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Unmasked form: the builtin receives (__B, __A), a zero vector, an all-ones
+   __mmask8 and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_sd (__m128d __A, __m128h __B)
+{
+ __m128d __zero = _mm_setzero_pd ();
+ return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A, __zero, (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: arguments are reordered to (__D, __C, __A, __B) and followed
+   by _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsh_sd (__m128d __A, __mmask8 __B, __m128d __C, __m128h __D)
+{
+ __m128d __res
+ = __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives (__C, __B), a zero vector, mask __A
+   and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsh_sd (__mmask8 __A, __m128d __B, __m128h __C)
+{
+ __m128d __zero = _mm_setzero_pd ();
+ return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B, __zero, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Unmasked form: the builtin receives (__B, __A), a zero vector, an all-ones
+   __mmask8 and __R.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_ss (__m128 __A, __m128h __B, const int __R)
+{
+ __m128 __zero = _mm_setzero_ps ();
+ return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A, __zero, (__mmask8) -1,
+ __R);
+}
+
+/* Masked form: arguments are reordered to (__D, __C, __A, __B, __R) for the
+   builtin.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsh_ss (__m128 __A, __mmask8 __B, __m128 __C, __m128h __D,
+ const int __R)
+{
+ __m128 __res = __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, __R);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives (__C, __B), a zero vector, mask __A
+   and __R.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsh_ss (__mmask8 __A, __m128 __B, __m128h __C, const int __R)
+{
+ __m128 __zero = _mm_setzero_ps ();
+ return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B, __zero, __A, __R);
+}
+
+/* Unmasked form: the builtin receives (__B, __A), a zero vector, an all-ones
+   __mmask8 and __R.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_sd (__m128d __A, __m128h __B, const int __R)
+{
+ __m128d __zero = _mm_setzero_pd ();
+ return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A, __zero, (__mmask8) -1,
+ __R);
+}
+
+/* Masked form: arguments are reordered to (__D, __C, __A, __B, __R) for the
+   builtin.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsh_sd (__m128d __A, __mmask8 __B, __m128d __C, __m128h __D,
+ const int __R)
+{
+ __m128d __res = __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, __R);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives (__C, __B), a zero vector, mask __A
+   and __R.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsh_sd (__mmask8 __A, __m128d __B, __m128h __C, const int __R)
+{
+ __m128d __zero = _mm_setzero_pd ();
+ return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B, __zero, __A, __R);
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined; operand order matches
+   the inline functions of the same names.  */
+#define _mm_cvt_roundsh_ss(V, H, R) \
+ (__builtin_ia32_vcvtsh2ss_mask_round ((H), (V), _mm_setzero_ps (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundsh_ss(W, U, V, H, R) \
+ (__builtin_ia32_vcvtsh2ss_mask_round ((H), (V), (W), (U), (R)))
+
+#define _mm_maskz_cvt_roundsh_ss(U, V, H, R) \
+ (__builtin_ia32_vcvtsh2ss_mask_round ((H), (V), _mm_setzero_ps (), \
+ (U), (R)))
+
+#define _mm_cvt_roundsh_sd(V, H, R) \
+ (__builtin_ia32_vcvtsh2sd_mask_round ((H), (V), _mm_setzero_pd (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundsh_sd(W, U, V, H, R) \
+ (__builtin_ia32_vcvtsh2sd_mask_round ((H), (V), (W), (U), (R)))
+
+#define _mm_maskz_cvt_roundsh_sd(U, V, H, R) \
+ (__builtin_ia32_vcvtsh2sd_mask_round ((H), (V), _mm_setzero_pd (), \
+ (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtss2sh, vcvtsd2sh. */
+/* Unmasked form: the builtin receives (__B, __A), a zero vector, an all-ones
+   __mmask8 and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_sh (__m128h __A, __m128 __B)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtss2sh_mask_round (__B, __A, __zero, (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: arguments are reordered to (__D, __C, __A, __B) and followed
+   by _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D)
+{
+ __m128h __res
+ = __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives (__C, __B), a zero vector, mask __A
+   and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtss_sh (__mmask8 __A, __m128h __B, __m128 __C)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtss2sh_mask_round (__C, __B, __zero, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Unmasked form: the builtin receives (__B, __A), a zero vector, an all-ones
+   __mmask8 and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_sh (__m128h __A, __m128d __B)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A, __zero, (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Masked form: arguments are reordered to (__D, __C, __A, __B) and followed
+   by _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D)
+{
+ __m128h __res
+ = __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives (__C, __B), a zero vector, mask __A
+   and _MM_FROUND_CUR_DIRECTION.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsd_sh (__mmask8 __A, __m128h __B, __m128d __C)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B, __zero, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+/* Unmasked form: the builtin receives (__B, __A), a zero vector, an all-ones
+   __mmask8 and __R.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_sh (__m128h __A, __m128 __B, const int __R)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtss2sh_mask_round (__B, __A, __zero, (__mmask8) -1,
+ __R);
+}
+
+/* Masked form: arguments are reordered to (__D, __C, __A, __B, __R) for the
+   builtin.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D,
+ const int __R)
+{
+ __m128h __res = __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, __R);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives (__C, __B), a zero vector, mask __A
+   and __R.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundss_sh (__mmask8 __A, __m128h __B, __m128 __C,
+ const int __R)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtss2sh_mask_round (__C, __B, __zero, __A, __R);
+}
+
+/* Unmasked form: the builtin receives (__B, __A), a zero vector, an all-ones
+   __mmask8 and __R.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_sh (__m128h __A, __m128d __B, const int __R)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A, __zero, (__mmask8) -1,
+ __R);
+}
+
+/* Masked form: arguments are reordered to (__D, __C, __A, __B, __R) for the
+   builtin.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D,
+ const int __R)
+{
+ __m128h __res = __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, __R);
+ return __res;
+}
+
+/* Zero-vector form: the builtin receives (__C, __B), a zero vector, mask __A
+   and __R.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsd_sh (__mmask8 __A, __m128h __B, __m128d __C,
+ const int __R)
+{
+ __m128h __zero = _mm_setzero_ph ();
+ return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B, __zero, __A, __R);
+}
+
+#else
+/* Macro forms used when __OPTIMIZE__ is not defined; operand order matches
+   the inline functions of the same names.  Every macro argument is
+   parenthesized in the expansion.  */
+#define _mm_cvt_roundss_sh(A, B, R) \
+ (__builtin_ia32_vcvtss2sh_mask_round ((B), (A), \
+ _mm_setzero_ph (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundss_sh(A, B, C, D, R) \
+ (__builtin_ia32_vcvtss2sh_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundss_sh(A, B, C, R) \
+ (__builtin_ia32_vcvtss2sh_mask_round ((C), (B), \
+ _mm_setzero_ph (), \
+ (A), (R)))
+
+#define _mm_cvt_roundsd_sh(A, B, R) \
+ (__builtin_ia32_vcvtsd2sh_mask_round ((B), (A), \
+ _mm_setzero_ph (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundsd_sh(A, B, C, D, R) \
+ (__builtin_ia32_vcvtsd2sh_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundsd_sh(A, B, C, R) \
+ (__builtin_ia32_vcvtsd2sh_mask_round ((C), (B), \
+ _mm_setzero_ph (), \
+ (A), (R)))
+
+#endif /* __OPTIMIZE__ */
+
#ifdef __DISABLE_AVX512FP16__
#undef __DISABLE_AVX512FP16__
#pragma GCC pop_options
diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h
index 1787ed5..59906d2 100644
--- a/gcc/config/i386/avx512fp16vlintrin.h
+++ b/gcc/config/i386/avx512fp16vlintrin.h
@@ -34,6 +34,123 @@
#define __DISABLE_AVX512FP16VL__
#endif /* __AVX512FP16VL__ */
+/* Return __a converted to __m128 by a plain vector cast.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_ps (__m128h __a)
+{
+ __m128 __res = (__m128) __a;
+ return __res;
+}
+
+/* Return __a converted to __m256 by a plain vector cast.  */
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_ps (__m256h __a)
+{
+ __m256 __res = (__m256) __a;
+ return __res;
+}
+
+/* Return __a converted to __m128d by a plain vector cast.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_pd (__m128h __a)
+{
+ __m128d __res = (__m128d) __a;
+ return __res;
+}
+
+/* Return __a converted to __m256d by a plain vector cast.  */
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_pd (__m256h __a)
+{
+ __m256d __res = (__m256d) __a;
+ return __res;
+}
+
+/* Return __a converted to __m128i by a plain vector cast.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_si128 (__m128h __a)
+{
+ __m128i __res = (__m128i) __a;
+ return __res;
+}
+
+/* Return __a converted to __m256i by a plain vector cast.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_si256 (__m256h __a)
+{
+ __m256i __res = (__m256i) __a;
+ return __res;
+}
+
+/* Return the __m128 __a converted to __m128h by a plain vector cast.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_ph (__m128 __a)
+{
+ __m128h __res = (__m128h) __a;
+ return __res;
+}
+
+/* Return the __m256 __a converted to __m256h by a plain vector cast.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_ph (__m256 __a)
+{
+ __m256h __res = (__m256h) __a;
+ return __res;
+}
+
+/* Return the __m128d __a converted to __m128h by a plain vector cast.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ph (__m128d __a)
+{
+ __m128h __res = (__m128h) __a;
+ return __res;
+}
+
+/* Return the __m256d __a converted to __m256h by a plain vector cast.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_ph (__m256d __a)
+{
+ __m256h __res = (__m256h) __a;
+ return __res;
+}
+
+/* Return the __m128i __a converted to __m128h by a plain vector cast.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ph (__m128i __a)
+{
+ __m128h __res = (__m128h) __a;
+ return __res;
+}
+
+/* Return the __m256i __a converted to __m256h by a plain vector cast.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_ph (__m256i __a)
+{
+ __m256h __res = (__m256h) __a;
+ return __res;
+}
+
+/* Return the first __m128h overlaying __A in a union.  The union members and
+   the local use reserved __-prefixed names so that user macros named a, v
+   or u cannot break this installed header.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph256_ph128 (__m256h __A)
+{
+ union
+ {
+ __m128h __a[2];
+ __m256h __v;
+ } __u = { .__v = __A };
+ return __u.__a[0];
+}
+
+/* Place __A in the first __m128h of a union and return the overlaying
+   __m256h.  Only the first half is written; the second half of the result
+   is left unspecified.  The union members and the local use reserved
+   __-prefixed names so that user macros named a, v or u cannot break this
+   installed header.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph128_ph256 (__m128h __A)
+{
+ union
+ {
+ __m128h __a[2];
+ __m256h __v;
+ } __u;
+ __u.__a[0] = __A;
+ return __u.__v;
+}
+
+/* Insert __A (viewed as __m128) at index 0 of a zeroed __m256 via
+   _mm256_insertf128_ps and return the result as __m256h.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextph128_ph256 (__m128h __A)
+{
+ __m256 __zero = _mm256_setzero_ps ();
+ return (__m256h) _mm256_insertf128_ps (__zero, (__m128) __A, 0);
+}
+
/* Intrinsics v[add,sub,mul,div]ph. */
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -53,30 +170,30 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_addph128_mask (__C, __D, __A, __B);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
{
- return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_addph256_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_addph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C)
{
- return __builtin_ia32_vaddph_v16hf_mask (__B, __C,
- _mm256_setzero_ph (), __A);
+ return __builtin_ia32_addph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
}
extern __inline __m128h
@@ -97,30 +214,30 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_subph128_mask (__C, __D, __A, __B);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
{
- return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_subph256_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_subph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C)
{
- return __builtin_ia32_vsubph_v16hf_mask (__B, __C,
- _mm256_setzero_ph (), __A);
+ return __builtin_ia32_subph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
}
extern __inline __m128h
@@ -141,30 +258,30 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_mulph128_mask (__C, __D, __A, __B);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
{
- return __builtin_ia32_vmulph_v16hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_mulph256_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_mulph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C)
{
- return __builtin_ia32_vmulph_v16hf_mask (__B, __C,
- _mm256_setzero_ph (), __A);
+ return __builtin_ia32_mulph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
}
extern __inline __m128h
@@ -185,30 +302,30 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_divph128_mask (__C, __D, __A, __B);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
{
- return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_divph256_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_divph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C)
{
- return __builtin_ia32_vdivph_v16hf_mask (__B, __C,
- _mm256_setzero_ph (), __A);
+ return __builtin_ia32_divph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
}
/* Intrinsics v[max,min]ph. */
@@ -216,96 +333,96 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ph (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vmaxph_v8hf_mask (__A, __B,
- _mm_setzero_ph (),
- (__mmask8) -1);
+ return __builtin_ia32_maxph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ph (__m256h __A, __m256h __B)
{
- return __builtin_ia32_vmaxph_v16hf_mask (__A, __B,
- _mm256_setzero_ph (),
- (__mmask16) -1);
+ return __builtin_ia32_maxph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_max_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vmaxph_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_maxph128_mask (__C, __D, __A, __B);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_max_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
{
- return __builtin_ia32_vmaxph_v16hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_maxph256_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_max_ph (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vmaxph_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_maxph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_max_ph (__mmask16 __A, __m256h __B, __m256h __C)
{
- return __builtin_ia32_vmaxph_v16hf_mask (__B, __C,
- _mm256_setzero_ph (), __A);
+ return __builtin_ia32_maxph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ph (__m128h __A, __m128h __B)
{
- return __builtin_ia32_vminph_v8hf_mask (__A, __B,
- _mm_setzero_ph (),
- (__mmask8) -1);
+ return __builtin_ia32_minph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ph (__m256h __A, __m256h __B)
{
- return __builtin_ia32_vminph_v16hf_mask (__A, __B,
- _mm256_setzero_ph (),
- (__mmask16) -1);
+ return __builtin_ia32_minph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_min_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
{
- return __builtin_ia32_vminph_v8hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_minph128_mask (__C, __D, __A, __B);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_min_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
{
- return __builtin_ia32_vminph_v16hf_mask (__C, __D, __A, __B);
+ return __builtin_ia32_minph256_mask (__C, __D, __A, __B);
}
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_min_ph (__mmask8 __A, __m128h __B, __m128h __C)
{
- return __builtin_ia32_vminph_v8hf_mask (__B, __C, _mm_setzero_ph (),
- __A);
+ return __builtin_ia32_minph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C)
{
- return __builtin_ia32_vminph_v16hf_mask (__B, __C,
- _mm256_setzero_ph (), __A);
+ return __builtin_ia32_minph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
}
/* vcmpph */
@@ -314,8 +431,8 @@ extern __inline __mmask8
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ph_mask (__m128h __A, __m128h __B, const int __C)
{
- return (__mmask8) __builtin_ia32_vcmpph_v8hf_mask (__A, __B, __C,
- (__mmask8) -1);
+ return (__mmask8) __builtin_ia32_cmpph128_mask (__A, __B, __C,
+ (__mmask8) -1);
}
extern __inline __mmask8
@@ -323,15 +440,15 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cmp_ph_mask (__mmask8 __A, __m128h __B, __m128h __C,
const int __D)
{
- return (__mmask8) __builtin_ia32_vcmpph_v8hf_mask (__B, __C, __D, __A);
+ return (__mmask8) __builtin_ia32_cmpph128_mask (__B, __C, __D, __A);
}
extern __inline __mmask16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_ph_mask (__m256h __A, __m256h __B, const int __C)
+_mm256_cmp_ph_mask (__m256h __A, __m256h __B, const int __C)
{
- return (__mmask16) __builtin_ia32_vcmpph_v16hf_mask (__A, __B, __C,
- (__mmask16) -1);
+ return (__mmask16) __builtin_ia32_cmpph256_mask (__A, __B, __C,
+ (__mmask16) -1);
}
extern __inline __mmask16
@@ -339,25 +456,1819 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_ph_mask (__mmask16 __A, __m256h __B, __m256h __C,
+_mm256_mask_cmp_ph_mask (__mmask16 __A, __m256h __B, __m256h __C,
const int __D)
{
- return (__mmask16) __builtin_ia32_vcmpph_v16hf_mask (__B, __C, __D,
- __A);
+ return (__mmask16) __builtin_ia32_cmpph256_mask (__B, __C, __D,
+ __A);
}
#else
-#define _mm_cmp_ph_mask(A, B, C) \
- (__builtin_ia32_vcmpph_v8hf_mask ((A), (B), (C), (-1)))
+#define _mm_cmp_ph_mask(A, B, C) \
+ (__builtin_ia32_cmpph128_mask ((A), (B), (C), (-1)))
-#define _mm_mask_cmp_ph_mask(A, B, C, D) \
- (__builtin_ia32_vcmpph_v8hf_mask ((B), (C), (D), (A)))
+#define _mm_mask_cmp_ph_mask(A, B, C, D) \
+ (__builtin_ia32_cmpph128_mask ((B), (C), (D), (A)))
-#define _mm256_cmp_ph_mask(A, B, C) \
- (__builtin_ia32_vcmpph_v16hf_mask ((A), (B), (C), (-1)))
+#define _mm256_cmp_ph_mask(A, B, C) \
+ (__builtin_ia32_cmpph256_mask ((A), (B), (C), (-1)))
-#define _mm256_mask_cmp_ph_mask(A, B, C, D) \
- (__builtin_ia32_vcmpph_v16hf_mask ((B), (C), (D), (A)))
+#define _mm256_mask_cmp_ph_mask(A, B, C, D) \
+ (__builtin_ia32_cmpph256_mask ((B), (C), (D), (A)))
#endif /* __OPTIMIZE__ */
+/* Intrinsics vsqrtph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ph (__m128h __A)
+{
+ return __builtin_ia32_sqrtph128_mask (__A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_ph (__m256h __A)
+{
+ return __builtin_ia32_sqrtph256_mask (__A, _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_sqrtph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+ return __builtin_ia32_sqrtph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_ph (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_sqrtph128_mask (__B, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_ph (__mmask16 __A, __m256h __B)
+{
+ return __builtin_ia32_sqrtph256_mask (__B, _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vrsqrtph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ph (__m128h __A)
+{
+ return __builtin_ia32_rsqrtph128_mask (__A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt_ph (__m256h __A)
+{
+ return __builtin_ia32_rsqrtph256_mask (__A, _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_rsqrtph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rsqrt_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+ return __builtin_ia32_rsqrtph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt_ph (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_rsqrtph128_mask (__B, _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rsqrt_ph (__mmask16 __A, __m256h __B)
+{
+ return __builtin_ia32_rsqrtph256_mask (__B, _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vrcpph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ph (__m128h __A)
+{
+ return __builtin_ia32_rcpph128_mask (__A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp_ph (__m256h __A)
+{
+ return __builtin_ia32_rcpph256_mask (__A, _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_rcpph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rcp_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+ return __builtin_ia32_rcpph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp_ph (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_rcpph128_mask (__B, _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rcp_ph (__mmask16 __A, __m256h __B)
+{
+ return __builtin_ia32_rcpph256_mask (__B, _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vscalefph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_ph (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_scalefph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_ph (__m256h __A, __m256h __B)
+{
+ return __builtin_ia32_scalefph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_scalefph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_ph (__m256h __A, __mmask16 __B, __m256h __C,
+ __m256h __D)
+{
+ return __builtin_ia32_scalefph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_scalefph128_mask (__B, __C,
+ _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+ return __builtin_ia32_scalefph256_mask (__B, __C,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vreduceph. */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_ph (__m128h __A, int __B)
+{
+ return __builtin_ia32_reduceph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_reduceph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_ph (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_reduceph128_mask (__B, __C,
+ _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_ph (__m256h __A, int __B)
+{
+ return __builtin_ia32_reduceph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_ph (__m256h __A, __mmask16 __B, __m256h __C, int __D)
+{
+ return __builtin_ia32_reduceph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_ph (__mmask16 __A, __m256h __B, int __C)
+{
+ return __builtin_ia32_reduceph256_mask (__B, __C,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+#else
+#define _mm_reduce_ph(A, B) \
+ (__builtin_ia32_reduceph128_mask ((A), (B), \
+ _mm_setzero_ph (), \
+ ((__mmask8)-1)))
+
+#define _mm_mask_reduce_ph(A, B, C, D) \
+ (__builtin_ia32_reduceph128_mask ((C), (D), (A), (B)))
+
+#define _mm_maskz_reduce_ph(A, B, C) \
+ (__builtin_ia32_reduceph128_mask ((B), (C), _mm_setzero_ph (), (A)))
+
+#define _mm256_reduce_ph(A, B) \
+ (__builtin_ia32_reduceph256_mask ((A), (B), \
+ _mm256_setzero_ph (), \
+ ((__mmask16)-1)))
+
+#define _mm256_mask_reduce_ph(A, B, C, D) \
+ (__builtin_ia32_reduceph256_mask ((C), (D), (A), (B)))
+
+#define _mm256_maskz_reduce_ph(A, B, C) \
+ (__builtin_ia32_reduceph256_mask ((B), (C), _mm256_setzero_ph (), (A)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vrndscaleph. */
+#ifdef __OPTIMIZE__
+ extern __inline __m128h
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_roundscale_ph (__m128h __A, int __B)
+ {
+ return __builtin_ia32_rndscaleph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+ }
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_rndscaleph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_ph (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_rndscaleph128_mask (__B, __C,
+ _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_roundscale_ph (__m256h __A, int __B)
+{
+ return __builtin_ia32_rndscaleph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_roundscale_ph (__m256h __A, __mmask16 __B, __m256h __C,
+ int __D)
+{
+ return __builtin_ia32_rndscaleph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_roundscale_ph (__mmask16 __A, __m256h __B, int __C)
+{
+ return __builtin_ia32_rndscaleph256_mask (__B, __C,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+#else
+#define _mm_roundscale_ph(A, B) \
+ (__builtin_ia32_rndscaleph128_mask ((A), (B), _mm_setzero_ph (), \
+ ((__mmask8)-1)))
+
+#define _mm_mask_roundscale_ph(A, B, C, D) \
+ (__builtin_ia32_rndscaleph128_mask ((C), (D), (A), (B)))
+
+#define _mm_maskz_roundscale_ph(A, B, C) \
+ (__builtin_ia32_rndscaleph128_mask ((B), (C), _mm_setzero_ph (), (A)))
+
+#define _mm256_roundscale_ph(A, B) \
+ (__builtin_ia32_rndscaleph256_mask ((A), (B), \
+ _mm256_setzero_ph(), \
+ ((__mmask16)-1)))
+
+#define _mm256_mask_roundscale_ph(A, B, C, D) \
+ (__builtin_ia32_rndscaleph256_mask ((C), (D), (A), (B)))
+
+#define _mm256_maskz_roundscale_ph(A, B, C) \
+ (__builtin_ia32_rndscaleph256_mask ((B), (C), \
+ _mm256_setzero_ph (), (A)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclassph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_mask_fpclass_ph_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_ph_mask (__m128h __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fpclass_ph_mask (__mmask16 __U, __m256h __A, const int __imm)
+{
+ return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fpclass_ph_mask (__m256h __A, const int __imm)
+{
+ return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A,
+ __imm,
+ (__mmask16) -1);
+}
+
+#else
+#define _mm_fpclass_ph_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \
+ (int) (C),(__mmask8)-1))
+
+#define _mm_mask_fpclass_ph_mask(u, X, C) \
+ ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \
+ (int) (C),(__mmask8)(u)))
+
+#define _mm256_fpclass_ph_mask(X, C) \
+ ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \
+ (int) (C),(__mmask16)-1))
+
+#define _mm256_mask_fpclass_ph_mask(u, X, C) \
+ ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \
+ (int) (C),(__mmask16)(u)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vgetexpph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_ph (__m256h __A)
+{
+ return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_ph (__m256h __W, __mmask16 __U, __m256h __A)
+{
+ return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+ (__v16hf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_ph (__mmask16 __U, __m256h __A)
+{
+ return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_ph (__m128h __A)
+{
+ return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_ph (__m128h __W, __mmask8 __U, __m128h __A)
+{
+ return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+ (__v8hf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_ph (__mmask8 __U, __m128h __A)
+{
+ return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) __U);
+}
+
+
+/* Intrinsics vgetmantph. */
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getmant_ph (__m256h __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+ (__C << 2) | __B,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getmant_ph (__m256h __W, __mmask16 __U, __m256h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+ (__C << 2) | __B,
+ (__v16hf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getmant_ph (__mmask16 __U, __m256h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+ (__C << 2) | __B,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_ph (__m128h __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+ (__C << 2) | __B,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_ph (__m128h __W, __mmask8 __U, __m128h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+ (__C << 2) | __B,
+ (__v8hf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_ph (__mmask8 __U, __m128h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+ (__C << 2) | __B,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) __U);
+}
+
+#else
+#define _mm256_getmant_ph(X, B, C) \
+ ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16hf)(__m256h)_mm256_setzero_ph (), \
+ (__mmask16)-1))
+
+#define _mm256_mask_getmant_ph(W, U, X, B, C) \
+ ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16hf)(__m256h)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_getmant_ph(U, X, B, C) \
+ ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16hf)(__m256h)_mm256_setzero_ph (), \
+ (__mmask16)(U)))
+
+#define _mm_getmant_ph(X, B, C) \
+ ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8hf)(__m128h)_mm_setzero_ph (), \
+ (__mmask8)-1))
+
+#define _mm_mask_getmant_ph(W, U, X, B, C) \
+ ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8hf)(__m128h)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_getmant_ph(U, X, B, C) \
+ ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8hf)(__m128h)_mm_setzero_ph (), \
+ (__mmask8)(U)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2dq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi32 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2dq128_mask (__A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2dq128_mask (__C, ( __v4si) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2dq128_mask (__B,
+ (__v4si) _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi32 (__m128h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2dq256_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2dq256_mask (__C, ( __v8si) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2dq256_mask (__B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvtph2udq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu32 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2udq128_mask (__A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2udq128_mask (__C, ( __v4si) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2udq128_mask (__B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu32 (__m128h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2udq256_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2udq256_mask (__C, ( __v8si) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2udq256_mask (__B,
+ (__v8si) _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2dq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi32 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2dq128_mask (__A,
+ (__v4si) _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)__builtin_ia32_vcvttph2dq128_mask (__C,
+ ( __v4si) __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2dq128_mask (__B,
+ (__v4si) _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi32 (__m128h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2dq256_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2dq256_mask (__C,
+ ( __v8si) __A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2dq256_mask (__B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2udq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu32 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2udq128_mask (__A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2udq128_mask (__C,
+ ( __v4si) __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2udq128_mask (__B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu32 (__m128h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2udq256_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (), (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2udq256_mask (__C,
+ ( __v8si) __A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2udq256_mask (__B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvtdq2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_ph (__m128i __A)
+{
+ return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+ return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_ph (__mmask8 __A, __m128i __B)
+{
+ return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_ph (__m256i __A)
+{
+ return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+ return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_ph (__mmask8 __A, __m256i __B)
+{
+ return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vcvtudq2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_ph (__m128i __A)
+{
+ return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+ return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __C,
+ __A,
+ __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_ph (__mmask8 __A, __m128i __B)
+{
+ return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_ph (__m256i __A)
+{
+ return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+ return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_ph (__mmask8 __A, __m256i __B)
+{
+ return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vcvtph2qq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi64 (__m128h __A)
+{
+ return
+ __builtin_ia32_vcvtph2qq128_mask (__A,
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2qq128_mask (__C, __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2qq128_mask (__B,
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi64 (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2qq256_mask (__A,
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2qq256_mask (__C, __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2qq256_mask (__B,
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvtph2uqq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu64 (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2uqq128_mask (__A,
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2uqq128_mask (__C, __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2uqq128_mask (__B,
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu64 (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2uqq256_mask (__A,
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2uqq256_mask (__C, __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2uqq256_mask (__B,
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2qq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi64 (__m128h __A)
+{
+ return __builtin_ia32_vcvttph2qq128_mask (__A,
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvttph2qq128_mask (__C,
+ __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvttph2qq128_mask (__B,
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi64 (__m128h __A)
+{
+ return __builtin_ia32_vcvttph2qq256_mask (__A,
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvttph2qq256_mask (__C,
+ __A,
+ __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero vector and __A
+   (mask) to __builtin_ia32_vcvttph2qq256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+  const __m256i __zero = _mm256_setzero_si256 ();
+  return __builtin_ia32_vcvttph2qq256_mask (__B, __zero, __A);
+}
+
+/* Intrinsics vcvttph2uqq. */
+/* Unmasked form: hands __A, a zero vector and an all-ones mask to
+   __builtin_ia32_vcvttph2uqq128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu64 (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128i __zero = _mm_setzero_si128 ();
+  return __builtin_ia32_vcvttph2uqq128_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A (second operand) and __B (mask)
+   to __builtin_ia32_vcvttph2uqq128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2uqq128_mask (__C, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero vector and __A
+   (mask) to __builtin_ia32_vcvttph2uqq128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+  const __m128i __zero = _mm_setzero_si128 ();
+  return __builtin_ia32_vcvttph2uqq128_mask (__B, __zero, __A);
+}
+
+/* Unmasked form: hands __A, a zero vector and an all-ones mask to
+   __builtin_ia32_vcvttph2uqq256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu64 (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m256i __zero = _mm256_setzero_si256 ();
+  return __builtin_ia32_vcvttph2uqq256_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A (second operand) and __B (mask)
+   to __builtin_ia32_vcvttph2uqq256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2uqq256_mask (__C, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero vector and __A
+   (mask) to __builtin_ia32_vcvttph2uqq256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+  const __m256i __zero = _mm256_setzero_si256 ();
+  return __builtin_ia32_vcvttph2uqq256_mask (__B, __zero, __A);
+}
+
+/* Intrinsics vcvtqq2ph. */
+/* Unmasked form: hands __A (viewed as __v2di), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtqq2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_ph (__m128i __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v2di), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtqq2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  const __v2di __src = (__v2di) __C;
+  return __builtin_ia32_vcvtqq2ph128_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v2di), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtqq2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_ph (__mmask8 __A, __m128i __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __B, __zero, __A);
+}
+
+/* Unmasked form: hands __A (viewed as __v4di), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtqq2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_ph (__m256i __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v4di), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtqq2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+  const __v4di __src = (__v4di) __C;
+  return __builtin_ia32_vcvtqq2ph256_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v4di), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtqq2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_ph (__mmask8 __A, __m256i __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __B, __zero, __A);
+}
+
+/* Intrinsics vcvtuqq2ph. */
+/* Unmasked form: hands __A (viewed as __v2di), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtuqq2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu64_ph (__m128i __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v2di), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtuqq2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  const __v2di __src = (__v2di) __C;
+  return __builtin_ia32_vcvtuqq2ph128_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v2di), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtuqq2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu64_ph (__mmask8 __A, __m128i __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __B, __zero, __A);
+}
+
+/* Unmasked form: hands __A (viewed as __v4di), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtuqq2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu64_ph (__m256i __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v4di), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtuqq2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+  const __v4di __src = (__v4di) __C;
+  return __builtin_ia32_vcvtuqq2ph256_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v4di), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtuqq2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu64_ph (__mmask8 __A, __m256i __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __B, __zero, __A);
+}
+
+/* Intrinsics vcvtph2w. */
+/* Unmasked form: hands __A, a zero __v8hi vector and an all-ones mask
+   to __builtin_ia32_vcvtph2w128_mask; the result is cast to __m128i.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi16 (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
+  return (__m128i) __builtin_ia32_vcvtph2w128_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A viewed as __v8hi (second operand)
+   and __B (mask) to __builtin_ia32_vcvtph2w128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  const __v8hi __second = (__v8hi) __A;
+  return (__m128i) __builtin_ia32_vcvtph2w128_mask (__C, __second, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __v8hi vector and
+   __A (mask) to __builtin_ia32_vcvtph2w128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi16 (__mmask8 __A, __m128h __B)
+{
+  const __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
+  return (__m128i) __builtin_ia32_vcvtph2w128_mask (__B, __zero, __A);
+}
+
+/* Unmasked form: hands __A, a zero __v16hi vector and an all-ones mask
+   to __builtin_ia32_vcvtph2w256_mask; the result is cast to __m256i.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi16 (__m256h __A)
+{
+  const __mmask16 __all = (__mmask16) -1;
+  const __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
+  return (__m256i) __builtin_ia32_vcvtph2w256_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A viewed as __v16hi (second operand)
+   and __B (mask) to __builtin_ia32_vcvtph2w256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+  const __v16hi __second = (__v16hi) __A;
+  return (__m256i) __builtin_ia32_vcvtph2w256_mask (__C, __second, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __v16hi vector and
+   __A (mask) to __builtin_ia32_vcvtph2w256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi16 (__mmask16 __A, __m256h __B)
+{
+  const __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
+  return (__m256i) __builtin_ia32_vcvtph2w256_mask (__B, __zero, __A);
+}
+
+/* Intrinsics vcvtph2uw. */
+/* Unmasked form: hands __A, a zero __v8hi vector and an all-ones mask
+   to __builtin_ia32_vcvtph2uw128_mask; the result is cast to __m128i.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu16 (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
+  return (__m128i) __builtin_ia32_vcvtph2uw128_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A viewed as __v8hi (second operand)
+   and __B (mask) to __builtin_ia32_vcvtph2uw128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  const __v8hi __second = (__v8hi) __A;
+  return (__m128i) __builtin_ia32_vcvtph2uw128_mask (__C, __second, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __v8hi vector and
+   __A (mask) to __builtin_ia32_vcvtph2uw128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu16 (__mmask8 __A, __m128h __B)
+{
+  const __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
+  return (__m128i) __builtin_ia32_vcvtph2uw128_mask (__B, __zero, __A);
+}
+
+/* Unmasked form: hands __A, a zero __v16hi vector and an all-ones mask
+   to __builtin_ia32_vcvtph2uw256_mask; the result is cast to __m256i.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu16 (__m256h __A)
+{
+  const __mmask16 __all = (__mmask16) -1;
+  const __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
+  return (__m256i) __builtin_ia32_vcvtph2uw256_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A viewed as __v16hi (second operand)
+   and __B (mask) to __builtin_ia32_vcvtph2uw256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+  const __v16hi __second = (__v16hi) __A;
+  return (__m256i) __builtin_ia32_vcvtph2uw256_mask (__C, __second, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __v16hi vector and
+   __A (mask) to __builtin_ia32_vcvtph2uw256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu16 (__mmask16 __A, __m256h __B)
+{
+  const __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
+  return (__m256i) __builtin_ia32_vcvtph2uw256_mask (__B, __zero, __A);
+}
+
+/* Intrinsics vcvttph2w. */
+/* Unmasked form: hands __A, a zero __v8hi vector and an all-ones mask
+   to __builtin_ia32_vcvttph2w128_mask; the result is cast to __m128i.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi16 (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
+  return (__m128i) __builtin_ia32_vcvttph2w128_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A viewed as __v8hi (second operand)
+   and __B (mask) to __builtin_ia32_vcvttph2w128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  const __v8hi __second = (__v8hi) __A;
+  return (__m128i) __builtin_ia32_vcvttph2w128_mask (__C, __second, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __v8hi vector and
+   __A (mask) to __builtin_ia32_vcvttph2w128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi16 (__mmask8 __A, __m128h __B)
+{
+  const __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
+  return (__m128i) __builtin_ia32_vcvttph2w128_mask (__B, __zero, __A);
+}
+
+/* Unmasked form: hands __A, a zero __v16hi vector and an all-ones mask
+   to __builtin_ia32_vcvttph2w256_mask; the result is cast to __m256i.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi16 (__m256h __A)
+{
+  const __mmask16 __all = (__mmask16) -1;
+  const __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
+  return (__m256i) __builtin_ia32_vcvttph2w256_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A viewed as __v16hi (second operand)
+   and __B (mask) to __builtin_ia32_vcvttph2w256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+  const __v16hi __second = (__v16hi) __A;
+  return (__m256i) __builtin_ia32_vcvttph2w256_mask (__C, __second, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __v16hi vector and
+   __A (mask) to __builtin_ia32_vcvttph2w256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi16 (__mmask16 __A, __m256h __B)
+{
+  const __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
+  return (__m256i) __builtin_ia32_vcvttph2w256_mask (__B, __zero, __A);
+}
+
+/* Intrinsics vcvttph2uw. */
+/* Unmasked form: hands __A, a zero __v8hi vector and an all-ones mask
+   to __builtin_ia32_vcvttph2uw128_mask; the result is cast to __m128i.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu16 (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
+  return (__m128i) __builtin_ia32_vcvttph2uw128_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A viewed as __v8hi (second operand)
+   and __B (mask) to __builtin_ia32_vcvttph2uw128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  const __v8hi __second = (__v8hi) __A;
+  return (__m128i) __builtin_ia32_vcvttph2uw128_mask (__C, __second, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __v8hi vector and
+   __A (mask) to __builtin_ia32_vcvttph2uw128_mask.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu16 (__mmask8 __A, __m128h __B)
+{
+  const __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
+  return (__m128i) __builtin_ia32_vcvttph2uw128_mask (__B, __zero, __A);
+}
+
+/* Unmasked form: hands __A, a zero __v16hi vector and an all-ones mask
+   to __builtin_ia32_vcvttph2uw256_mask; the result is cast to __m256i.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu16 (__m256h __A)
+{
+  const __mmask16 __all = (__mmask16) -1;
+  const __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
+  return (__m256i) __builtin_ia32_vcvttph2uw256_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A viewed as __v16hi (second operand)
+   and __B (mask) to __builtin_ia32_vcvttph2uw256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+  const __v16hi __second = (__v16hi) __A;
+  return (__m256i) __builtin_ia32_vcvttph2uw256_mask (__C, __second, __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __v16hi vector and
+   __A (mask) to __builtin_ia32_vcvttph2uw256_mask.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu16 (__mmask16 __A, __m256h __B)
+{
+  const __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
+  return (__m256i) __builtin_ia32_vcvttph2uw256_mask (__B, __zero, __A);
+}
+
+/* Intrinsics vcvtw2ph. */
+/* Unmasked form: hands __A (viewed as __v8hi), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtw2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_ph (__m128i __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v8hi), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtw2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __C, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v8hi), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtw2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_ph (__mmask8 __A, __m128i __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __B, __zero, __A);
+}
+
+/* Unmasked form: hands __A (viewed as __v16hi), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtw2ph256_mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_ph (__m256i __A)
+{
+  const __mmask16 __all = (__mmask16) -1;
+  const __m256h __zero = _mm256_setzero_ph ();
+  return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v16hi), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtw2ph256_mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_ph (__m256h __A, __mmask16 __B, __m256i __C)
+{
+  return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __C, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v16hi), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtw2ph256_mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_ph (__mmask16 __A, __m256i __B)
+{
+  const __m256h __zero = _mm256_setzero_ph ();
+  return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __B, __zero, __A);
+}
+
+/* Intrinsics vcvtuw2ph. */
+/* Unmasked form: hands __A (viewed as __v8hi), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtuw2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_ph (__m128i __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v8hi), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtuw2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu16_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  const __v8hi __src = (__v8hi) __C;
+  return __builtin_ia32_vcvtuw2ph128_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v8hi), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtuw2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu16_ph (__mmask8 __A, __m128i __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __B, __zero, __A);
+}
+
+/* Unmasked form: hands __A (viewed as __v16hi), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtuw2ph256_mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_ph (__m256i __A)
+{
+  const __mmask16 __all = (__mmask16) -1;
+  const __m256h __zero = _mm256_setzero_ph ();
+  return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v16hi), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtuw2ph256_mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu16_ph (__m256h __A, __mmask16 __B, __m256i __C)
+{
+  const __v16hi __src = (__v16hi) __C;
+  return __builtin_ia32_vcvtuw2ph256_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v16hi), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtuw2ph256_mask.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu16_ph (__mmask16 __A, __m256i __B)
+{
+  const __m256h __zero = _mm256_setzero_ph ();
+  return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __B, __zero, __A);
+}
+
+/* Intrinsics vcvtph2pd. */
+/* Unmasked form: hands __A, a zero __m128d vector and an all-ones mask
+   to __builtin_ia32_vcvtph2pd128_mask.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_pd (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128d __zero = _mm_setzero_pd ();
+  return __builtin_ia32_vcvtph2pd128_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A (second operand) and __B (mask)
+   to __builtin_ia32_vcvtph2pd128_mask.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_pd (__m128d __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2pd128_mask (__C,
+                                           __A,
+                                           __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __m128d vector and
+   __A (mask) to __builtin_ia32_vcvtph2pd128_mask.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+  const __m128d __zero = _mm_setzero_pd ();
+  return __builtin_ia32_vcvtph2pd128_mask (__B, __zero, __A);
+}
+
+/* Unmasked form: hands __A, a zero __m256d vector and an all-ones mask
+   to __builtin_ia32_vcvtph2pd256_mask.  */
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_pd (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m256d __zero = _mm256_setzero_pd ();
+  return __builtin_ia32_vcvtph2pd256_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A (second operand) and __B (mask)
+   to __builtin_ia32_vcvtph2pd256_mask.  */
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_pd (__m256d __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2pd256_mask (__C,
+                                           __A,
+                                           __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __m256d vector and
+   __A (mask) to __builtin_ia32_vcvtph2pd256_mask.  */
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+  const __m256d __zero = _mm256_setzero_pd ();
+  return __builtin_ia32_vcvtph2pd256_mask (__B, __zero, __A);
+}
+
+/* Intrinsics vcvtph2psx. */
+/* Unmasked form: hands __A, a zero __m128 vector and an all-ones mask
+   to __builtin_ia32_vcvtph2psx128_mask.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtxph_ps (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128 __zero = _mm_setzero_ps ();
+  return __builtin_ia32_vcvtph2psx128_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A (second operand) and __B (mask)
+   to __builtin_ia32_vcvtph2psx128_mask.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtxph_ps (__m128 __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2psx128_mask (__C,
+                                            __A,
+                                            __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __m128 vector and
+   __A (mask) to __builtin_ia32_vcvtph2psx128_mask.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtxph_ps (__mmask8 __A, __m128h __B)
+{
+  const __m128 __zero = _mm_setzero_ps ();
+  return __builtin_ia32_vcvtph2psx128_mask (__B, __zero, __A);
+}
+
+/* Unmasked form: hands __A, a zero __m256 vector and an all-ones mask
+   to __builtin_ia32_vcvtph2psx256_mask.  */
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtxph_ps (__m128h __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m256 __zero = _mm256_setzero_ps ();
+  return __builtin_ia32_vcvtph2psx256_mask (__A, __zero, __all);
+}
+
+/* Mask form: hands __C (input), __A (second operand) and __B (mask)
+   to __builtin_ia32_vcvtph2psx256_mask.  */
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtxph_ps (__m256 __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2psx256_mask (__C,
+                                            __A,
+                                            __B);
+}
+
+/* Zero-operand mask form: hands __B (input), a zero __m256 vector and
+   __A (mask) to __builtin_ia32_vcvtph2psx256_mask.  */
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtxph_ps (__mmask8 __A, __m128h __B)
+{
+  const __m256 __zero = _mm256_setzero_ps ();
+  return __builtin_ia32_vcvtph2psx256_mask (__B, __zero, __A);
+}
+
+/* Intrinsics vcvtps2phx. */
+/* Unmasked form: hands __A (viewed as __v4sf), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtps2phx128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtxps_ph (__m128 __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v4sf), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtps2phx128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m128 __C)
+{
+  const __v4sf __src = (__v4sf) __C;
+  return __builtin_ia32_vcvtps2phx128_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v4sf), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtps2phx128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtxps_ph (__mmask8 __A, __m128 __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __B, __zero, __A);
+}
+
+/* Unmasked form: hands __A (viewed as __v8sf), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtps2phx256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtxps_ph (__m256 __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v8sf), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtps2phx256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m256 __C)
+{
+  const __v8sf __src = (__v8sf) __C;
+  return __builtin_ia32_vcvtps2phx256_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v8sf), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtps2phx256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtxps_ph (__mmask8 __A, __m256 __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __B, __zero, __A);
+}
+
+/* Intrinsics vcvtpd2ph. */
+/* Unmasked form: hands __A (viewed as __v2df), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtpd2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ph (__m128d __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v2df), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtpd2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m128d __C)
+{
+  const __v2df __src = (__v2df) __C;
+  return __builtin_ia32_vcvtpd2ph128_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v2df), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtpd2ph128_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_ph (__mmask8 __A, __m128d __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __B, __zero, __A);
+}
+
+/* Unmasked form: hands __A (viewed as __v4df), a zero FP16 vector and
+   an all-ones mask to __builtin_ia32_vcvtpd2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_ph (__m256d __A)
+{
+  const __mmask8 __all = (__mmask8) -1;
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __A, __zero, __all);
+}
+
+/* Mask form: hands __C (viewed as __v4df), __A (second operand) and
+   __B (mask) to __builtin_ia32_vcvtpd2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m256d __C)
+{
+  const __v4df __src = (__v4df) __C;
+  return __builtin_ia32_vcvtpd2ph256_mask (__src, __A, __B);
+}
+
+/* Zero-operand mask form: hands __B (viewed as __v4df), a zero FP16
+   vector and __A (mask) to __builtin_ia32_vcvtpd2ph256_mask.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_ph (__mmask8 __A, __m256d __B)
+{
+  const __m128h __zero = _mm_setzero_ph ();
+  return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __B, __zero, __A);
+}
+
#ifdef __DISABLE_AVX512FP16VL__
#undef __DISABLE_AVX512FP16VL__
#pragma GCC pop_options
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index d11c02b..7fd4286 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -134,6 +134,7 @@ DEF_POINTER_TYPE (PCVOID, VOID, CONST)
DEF_POINTER_TYPE (PVOID, VOID)
DEF_POINTER_TYPE (PDOUBLE, DOUBLE)
DEF_POINTER_TYPE (PFLOAT, FLOAT)
+DEF_POINTER_TYPE (PCFLOAT16, FLOAT16, CONST)
DEF_POINTER_TYPE (PSHORT, SHORT)
DEF_POINTER_TYPE (PUSHORT, USHORT)
DEF_POINTER_TYPE (PINT, INT)
@@ -1304,17 +1305,72 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID)
# FP16 builtins
DEF_FUNCTION_TYPE (V8HF, V8HI)
+DEF_FUNCTION_TYPE (QI, V8HF, INT, UQI)
+DEF_FUNCTION_TYPE (HI, V16HF, INT, UHI)
+DEF_FUNCTION_TYPE (SI, V32HF, INT, USI)
+DEF_FUNCTION_TYPE (INT, V8HF, INT)
+DEF_FUNCTION_TYPE (INT64, V8HF, INT)
+DEF_FUNCTION_TYPE (UINT, V8HF, INT)
+DEF_FUNCTION_TYPE (UINT64, V8HF, INT)
DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF)
+DEF_FUNCTION_TYPE (VOID, PCFLOAT16, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, PCFLOAT16, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V8HF, INT, INT)
+DEF_FUNCTION_TYPE (V8HF, V8HF, INT64, INT)
+DEF_FUNCTION_TYPE (V8HF, V8HF, UINT, INT)
+DEF_FUNCTION_TYPE (V8HF, V8HF, UINT64, INT)
+DEF_FUNCTION_TYPE (V2DI, V8HF, V2DI, UQI)
+DEF_FUNCTION_TYPE (V4DI, V8HF, V4DI, UQI)
+DEF_FUNCTION_TYPE (V2DF, V8HF, V2DF, UQI)
+DEF_FUNCTION_TYPE (V4DF, V8HF, V4DF, UQI)
+DEF_FUNCTION_TYPE (V4SI, V8HF, V4SI, UQI)
+DEF_FUNCTION_TYPE (V4SF, V8HF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V8SI, V8HF, V8SI, UQI)
+DEF_FUNCTION_TYPE (V8SF, V8HF, V8SF, UQI)
+DEF_FUNCTION_TYPE (V8HI, V8HF, V8HI, UQI)
+DEF_FUNCTION_TYPE (V8HF, V4SI, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V8SI, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V8SF, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V2DI, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V4DI, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V4DF, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V8HI, V8HF, UQI)
+DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, UQI)
DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT)
+DEF_FUNCTION_TYPE (V8HF, V8HF, INT, V8HF, UQI)
DEF_FUNCTION_TYPE (UQI, V8HF, V8HF, INT, UQI)
DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI)
DEF_FUNCTION_TYPE (UQI, V8HF, V8HF, INT, UQI, INT)
+DEF_FUNCTION_TYPE (V8DI, V8HF, V8DI, UQI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8HF, V8DF, UQI, INT)
+DEF_FUNCTION_TYPE (V8HF, V8DI, V8HF, UQI, INT)
+DEF_FUNCTION_TYPE (V8HF, V8DF, V8HF, UQI, INT)
DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT)
+DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, V8HF, UQI, INT)
+DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, V8HF, UQI, INT)
+DEF_FUNCTION_TYPE (V2DF, V8HF, V2DF, V2DF, UQI, INT)
+DEF_FUNCTION_TYPE (V4SF, V8HF, V4SF, V4SF, UQI, INT)
+DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT, V8HF, UQI, INT)
DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF)
+DEF_FUNCTION_TYPE (V16HI, V16HF, V16HI, UHI)
+DEF_FUNCTION_TYPE (V16HF, V16HI, V16HF, UHI)
+DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, UHI)
+DEF_FUNCTION_TYPE (V16SI, V16HF, V16SI, UHI, INT)
+DEF_FUNCTION_TYPE (V16SF, V16HF, V16SF, UHI, INT)
+DEF_FUNCTION_TYPE (V16HF, V16HF, INT, V16HF, UHI)
DEF_FUNCTION_TYPE (UHI, V16HF, V16HF, INT, UHI)
+DEF_FUNCTION_TYPE (V16HF, V16SI, V16HF, UHI, INT)
+DEF_FUNCTION_TYPE (V16HF, V16SF, V16HF, UHI, INT)
DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT)
+DEF_FUNCTION_TYPE (V32HI, V32HF, V32HI, USI, INT)
+DEF_FUNCTION_TYPE (V32HF, V32HI, V32HF, USI, INT)
DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI)
+DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI, INT)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI)
DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
+DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index c9d80cb..dc56dc2 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -393,6 +393,10 @@ BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mas
BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI)
BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI)
+/* AVX512FP16 */
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, "__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) V8HF_FTYPE_PCFLOAT16_V8HF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_storehf_mask, "__builtin_ia32_storesh_mask", IX86_BUILTIN_STORESH_MASK, UNKNOWN, (int) VOID_FTYPE_PCFLOAT16_V8HF_UQI)
+
/* RDPKRU and WRPKRU. */
BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru, "__builtin_ia32_rdpkru", IX86_BUILTIN_RDPKRU, UNKNOWN, (int) UNSIGNED_FTYPE_VOID)
BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru, "__builtin_ia32_wrpkru", IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED)
@@ -2775,33 +2779,102 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__b
BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
/* AVX512FP16. */
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask, "__builtin_ia32_vaddsh_v8hf_mask", IX86_BUILTIN_VADDSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask, "__builtin_ia32_vsubsh_v8hf_mask", IX86_BUILTIN_VSUBSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask, "__builtin_ia32_vmulsh_v8hf_mask", IX86_BUILTIN_VMULSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask, "__builtin_ia32_vdivsh_v8hf_mask", IX86_BUILTIN_VDIVSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv8hf3_mask, "__builtin_ia32_vmaxph_v8hf_mask", IX86_BUILTIN_VMAXPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv16hf3_mask, "__builtin_ia32_vmaxph_v16hf_mask", IX86_BUILTIN_VMAXPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask, "__builtin_ia32_vmaxph_v32hf_mask", IX86_BUILTIN_VMAXPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv8hf3_mask, "__builtin_ia32_vminph_v8hf_mask", IX86_BUILTIN_VMINPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv16hf3_mask, "__builtin_ia32_vminph_v16hf_mask", IX86_BUILTIN_VMINPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask, "__builtin_ia32_vminph_v32hf_mask", IX86_BUILTIN_VMINPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask, "__builtin_ia32_vmaxsh_v8hf_mask", IX86_BUILTIN_VMAXSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask, "__builtin_ia32_vminsh_v8hf_mask", IX86_BUILTIN_VMINSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_cmpv8hf3_mask, "__builtin_ia32_vcmpph_v8hf_mask", IX86_BUILTIN_VCMPPH_V8HF_MASK, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI)
-BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_cmpv16hf3_mask, "__builtin_ia32_vcmpph_v16hf_mask", IX86_BUILTIN_VCMPPH_V16HF_MASK, UNKNOWN, (int) UHI_FTYPE_V16HF_V16HF_INT_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask, "__builtin_ia32_vcmpph_v32hf_mask", IX86_BUILTIN_VCMPPH_V32HF_MASK, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_addph256_mask", IX86_BUILTIN_ADDPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_addph512_mask", IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_subph128_mask", IX86_BUILTIN_SUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_subph256_mask", IX86_BUILTIN_SUBPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_subph512_mask", IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_mulph128_mask", IX86_BUILTIN_MULPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_mulph256_mask", IX86_BUILTIN_MULPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_mulph512_mask", IX86_BUILTIN_MULPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_divph128_mask", IX86_BUILTIN_DIVPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_divph256_mask", IX86_BUILTIN_DIVPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_divph512_mask", IX86_BUILTIN_DIVPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask, "__builtin_ia32_addsh_mask", IX86_BUILTIN_ADDSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask, "__builtin_ia32_subsh_mask", IX86_BUILTIN_SUBSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask, "__builtin_ia32_mulsh_mask", IX86_BUILTIN_MULSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask, "__builtin_ia32_divsh_mask", IX86_BUILTIN_DIVSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv8hf3_mask, "__builtin_ia32_maxph128_mask", IX86_BUILTIN_MAXPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv16hf3_mask, "__builtin_ia32_maxph256_mask", IX86_BUILTIN_MAXPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask, "__builtin_ia32_maxph512_mask", IX86_BUILTIN_MAXPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv8hf3_mask, "__builtin_ia32_minph128_mask", IX86_BUILTIN_MINPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv16hf3_mask, "__builtin_ia32_minph256_mask", IX86_BUILTIN_MINPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask, "__builtin_ia32_minph512_mask", IX86_BUILTIN_MINPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask, "__builtin_ia32_maxsh_mask", IX86_BUILTIN_MAXSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask, "__builtin_ia32_minsh_mask", IX86_BUILTIN_MINSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_cmpv8hf3_mask, "__builtin_ia32_cmpph128_mask", IX86_BUILTIN_CMPPH128_MASK, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_cmpv16hf3_mask, "__builtin_ia32_cmpph256_mask", IX86_BUILTIN_CMPPH256_MASK, UNKNOWN, (int) UHI_FTYPE_V16HF_V16HF_INT_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask, "__builtin_ia32_cmpph512_mask", IX86_BUILTIN_CMPPH512_MASK, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv8hf2_mask, "__builtin_ia32_sqrtph128_mask", IX86_BUILTIN_SQRTPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv16hf2_mask, "__builtin_ia32_sqrtph256_mask", IX86_BUILTIN_SQRTPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv8hf2_mask, "__builtin_ia32_rsqrtph128_mask", IX86_BUILTIN_RSQRTPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv16hf2_mask, "__builtin_ia32_rsqrtph256_mask", IX86_BUILTIN_RSQRTPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv32hf2_mask, "__builtin_ia32_rsqrtph512_mask", IX86_BUILTIN_RSQRTPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmrsqrtv8hf2_mask, "__builtin_ia32_rsqrtsh_mask", IX86_BUILTIN_RSQRTSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv8hf2_mask, "__builtin_ia32_rcpph128_mask", IX86_BUILTIN_RCPPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv16hf2_mask, "__builtin_ia32_rcpph256_mask", IX86_BUILTIN_RCPPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv32hf2_mask, "__builtin_ia32_rcpph512_mask", IX86_BUILTIN_RCPPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmrcpv8hf2_mask, "__builtin_ia32_rcpsh_mask", IX86_BUILTIN_RCPSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_scalefv8hf_mask, "__builtin_ia32_scalefph128_mask", IX86_BUILTIN_SCALEFPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_scalefv16hf_mask, "__builtin_ia32_scalefph256_mask", IX86_BUILTIN_SCALEFPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducepv8hf_mask, "__builtin_ia32_reduceph128_mask", IX86_BUILTIN_REDUCEPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducepv16hf_mask, "__builtin_ia32_reduceph256_mask", IX86_BUILTIN_REDUCEPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf_mask, "__builtin_ia32_rndscaleph128_mask", IX86_BUILTIN_RNDSCALEPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf_mask, "__builtin_ia32_rndscaleph256_mask", IX86_BUILTIN_RNDSCALEPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv16hf_mask, "__builtin_ia32_fpclassph256_mask", IX86_BUILTIN_FPCLASSPH256, UNKNOWN, (int) HI_FTYPE_V16HF_INT_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv8hf_mask, "__builtin_ia32_fpclassph128_mask", IX86_BUILTIN_FPCLASSPH128, UNKNOWN, (int) QI_FTYPE_V8HF_INT_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv32hf_mask, "__builtin_ia32_fpclassph512_mask", IX86_BUILTIN_FPCLASSPH512, UNKNOWN, (int) SI_FTYPE_V32HF_INT_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_vmfpclassv8hf_mask, "__builtin_ia32_fpclasssh_mask", IX86_BUILTIN_FPCLASSSH_MASK, UNKNOWN, (int) QI_FTYPE_V8HF_INT_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_getexpv16hf_mask, "__builtin_ia32_getexpph256_mask", IX86_BUILTIN_GETEXPPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_getexpv8hf_mask, "__builtin_ia32_getexpph128_mask", IX86_BUILTIN_GETEXPPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_getmantv16hf_mask, "__builtin_ia32_getmantph256_mask", IX86_BUILTIN_GETMANTPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_getmantv8hf_mask, "__builtin_ia32_getmantph128_mask", IX86_BUILTIN_GETMANTPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_movhf_mask, "__builtin_ia32_vmovsh_mask", IX86_BUILTIN_VMOVSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v4si_mask, "__builtin_ia32_vcvtph2dq128_mask", IX86_BUILTIN_VCVTPH2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v8si_mask, "__builtin_ia32_vcvtph2dq256_mask", IX86_BUILTIN_VCVTPH2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v4si_mask, "__builtin_ia32_vcvtph2udq128_mask", IX86_BUILTIN_VCVTPH2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v8si_mask, "__builtin_ia32_vcvtph2udq256_mask", IX86_BUILTIN_VCVTPH2UDQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv4si2_mask, "__builtin_ia32_vcvttph2dq128_mask", IX86_BUILTIN_VCVTTPH2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8si2_mask, "__builtin_ia32_vcvttph2dq256_mask", IX86_BUILTIN_VCVTTPH2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv4si2_mask, "__builtin_ia32_vcvttph2udq128_mask", IX86_BUILTIN_VCVTTPH2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8si2_mask, "__builtin_ia32_vcvttph2udq256_mask", IX86_BUILTIN_VCVTTPH2UDQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v2di_mask, "__builtin_ia32_vcvtph2qq128_mask", IX86_BUILTIN_VCVTPH2QQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v4di_mask, "__builtin_ia32_vcvtph2qq256_mask", IX86_BUILTIN_VCVTPH2QQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v2di_mask, "__builtin_ia32_vcvtph2uqq128_mask", IX86_BUILTIN_VCVTPH2UQQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v4di_mask, "__builtin_ia32_vcvtph2uqq256_mask", IX86_BUILTIN_VCVTPH2UQQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv2di2_mask, "__builtin_ia32_vcvttph2qq128_mask", IX86_BUILTIN_VCVTTPH2QQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv4di2_mask, "__builtin_ia32_vcvttph2qq256_mask", IX86_BUILTIN_VCVTTPH2QQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv2di2_mask, "__builtin_ia32_vcvttph2uqq128_mask", IX86_BUILTIN_VCVTTPH2UQQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv4di2_mask, "__builtin_ia32_vcvttph2uqq256_mask", IX86_BUILTIN_VCVTTPH2UQQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v8hi_mask, "__builtin_ia32_vcvtph2w128_mask", IX86_BUILTIN_VCVTPH2W128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v16hi_mask, "__builtin_ia32_vcvtph2w256_mask", IX86_BUILTIN_VCVTPH2W256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v8hi_mask, "__builtin_ia32_vcvtph2uw128_mask", IX86_BUILTIN_VCVTPH2UW128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v16hi_mask, "__builtin_ia32_vcvtph2uw256_mask", IX86_BUILTIN_VCVTPH2UW256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8hi2_mask, "__builtin_ia32_vcvttph2w128_mask", IX86_BUILTIN_VCVTTPH2W128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv16hi2_mask, "__builtin_ia32_vcvttph2w256_mask", IX86_BUILTIN_VCVTTPH2W256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8hi2_mask, "__builtin_ia32_vcvttph2uw128_mask", IX86_BUILTIN_VCVTTPH2UW128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv16hi2_mask, "__builtin_ia32_vcvttph2uw256_mask", IX86_BUILTIN_VCVTTPH2UW256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v8hi_mask, "__builtin_ia32_vcvtw2ph128_mask", IX86_BUILTIN_VCVTW2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v16hi_mask, "__builtin_ia32_vcvtw2ph256_mask", IX86_BUILTIN_VCVTW2PH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HI_V16HF_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v8hi_mask, "__builtin_ia32_vcvtuw2ph128_mask", IX86_BUILTIN_VCVTUW2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v16hi_mask, "__builtin_ia32_vcvtuw2ph256_mask", IX86_BUILTIN_VCVTUW2PH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HI_V16HF_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v4si_mask, "__builtin_ia32_vcvtdq2ph128_mask", IX86_BUILTIN_VCVTDQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v8si_mask, "__builtin_ia32_vcvtdq2ph256_mask", IX86_BUILTIN_VCVTDQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v4si_mask, "__builtin_ia32_vcvtudq2ph128_mask", IX86_BUILTIN_VCVTUDQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v8si_mask, "__builtin_ia32_vcvtudq2ph256_mask", IX86_BUILTIN_VCVTUDQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v2di_mask, "__builtin_ia32_vcvtqq2ph128_mask", IX86_BUILTIN_VCVTQQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v4di_mask, "__builtin_ia32_vcvtqq2ph256_mask", IX86_BUILTIN_VCVTQQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v2di_mask, "__builtin_ia32_vcvtuqq2ph128_mask", IX86_BUILTIN_VCVTUQQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v4di_mask, "__builtin_ia32_vcvtuqq2ph256_mask", IX86_BUILTIN_VCVTUQQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DI_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv2df2_mask, "__builtin_ia32_vcvtph2pd128_mask", IX86_BUILTIN_VCVTPH2PD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V8HF_V2DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv4df2_mask, "__builtin_ia32_vcvtph2pd256_mask", IX86_BUILTIN_VCVTPH2PD256_MASK, UNKNOWN, (int) V4DF_FTYPE_V8HF_V4DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv4sf2_mask, "__builtin_ia32_vcvtph2psx128_mask", IX86_BUILTIN_VCVTPH2PSX128_MASK, UNKNOWN, (int) V4SF_FTYPE_V8HF_V4SF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv8sf2_mask, "__builtin_ia32_vcvtph2psx256_mask", IX86_BUILTIN_VCVTPH2PSX256_MASK, UNKNOWN, (int) V8SF_FTYPE_V8HF_V8SF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v4sf_mask, "__builtin_ia32_vcvtps2phx128_mask", IX86_BUILTIN_VCVTPS2PHX128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v8sf_mask, "__builtin_ia32_vcvtps2phx256_mask", IX86_BUILTIN_VCVTPS2PHX256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v2df_mask, "__builtin_ia32_vcvtpd2ph128_mask", IX86_BUILTIN_VCVTPD2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DF_V8HF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v4df_mask, "__builtin_ia32_vcvtpd2ph256_mask", IX86_BUILTIN_VCVTPD2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DF_V8HF_UQI)
/* Builtins with rounding support. */
BDESC_END (ARGS, ROUND_ARGS)
@@ -3003,20 +3076,70 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "_
BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT)
/* AVX512FP16. */
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask_round, "__builtin_ia32_vaddsh_v8hf_mask_round", IX86_BUILTIN_VADDSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask_round, "__builtin_ia32_vsubsh_v8hf_mask_round", IX86_BUILTIN_VSUBSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask_round, "__builtin_ia32_vmulsh_v8hf_mask_round", IX86_BUILTIN_VMULSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask_round, "__builtin_ia32_vdivsh_v8hf_mask_round", IX86_BUILTIN_VDIVSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask_round, "__builtin_ia32_vmaxph_v32hf_mask_round", IX86_BUILTIN_VMAXPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask_round, "__builtin_ia32_vminph_v32hf_mask_round", IX86_BUILTIN_VMINPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask_round, "__builtin_ia32_vmaxsh_v8hf_mask_round", IX86_BUILTIN_VMAXSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask_round, "__builtin_ia32_vminsh_v8hf_mask_round", IX86_BUILTIN_VMINSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask_round, "__builtin_ia32_vcmpph_v32hf_mask_round", IX86_BUILTIN_VCMPPH_V32HF_MASK_ROUND, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI_INT)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmcmpv8hf3_mask_round, "__builtin_ia32_vcmpsh_v8hf_mask_round", IX86_BUILTIN_VCMPSH_V8HF_MASK_ROUND, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_addph512_mask_round", IX86_BUILTIN_ADDPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_subph512_mask_round", IX86_BUILTIN_SUBPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_mulph512_mask_round", IX86_BUILTIN_MULPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_divph512_mask_round", IX86_BUILTIN_DIVPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask_round, "__builtin_ia32_addsh_mask_round", IX86_BUILTIN_ADDSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask_round, "__builtin_ia32_subsh_mask_round", IX86_BUILTIN_SUBSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask_round, "__builtin_ia32_mulsh_mask_round", IX86_BUILTIN_MULSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask_round, "__builtin_ia32_divsh_mask_round", IX86_BUILTIN_DIVSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask_round, "__builtin_ia32_maxph512_mask_round", IX86_BUILTIN_MAXPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask_round, "__builtin_ia32_minph512_mask_round", IX86_BUILTIN_MINPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask_round, "__builtin_ia32_maxsh_mask_round", IX86_BUILTIN_MAXSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask_round, "__builtin_ia32_minsh_mask_round", IX86_BUILTIN_MINSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask_round, "__builtin_ia32_cmpph512_mask_round", IX86_BUILTIN_CMPPH512_MASK_ROUND, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmcmpv8hf3_mask_round, "__builtin_ia32_cmpsh_mask_round", IX86_BUILTIN_CMPSH_MASK_ROUND, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv32hf2_mask_round, "__builtin_ia32_sqrtph512_mask_round", IX86_BUILTIN_SQRTPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsqrtv8hf2_mask_round, "__builtin_ia32_sqrtsh_mask_round", IX86_BUILTIN_SQRTSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_scalefv32hf_mask_round, "__builtin_ia32_scalefph512_mask_round", IX86_BUILTIN_SCALEFPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmscalefv8hf_mask_round, "__builtin_ia32_scalefsh_mask_round", IX86_BUILTIN_SCALEFSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducepv32hf_mask_round, "__builtin_ia32_reduceph512_mask_round", IX86_BUILTIN_REDUCEPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducesv8hf_mask_round, "__builtin_ia32_reducesh_mask_round", IX86_BUILTIN_REDUCESH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf_mask_round, "__builtin_ia32_rndscaleph512_mask_round", IX86_BUILTIN_RNDSCALEPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_rndscalev8hf_mask_round, "__builtin_ia32_rndscalesh_mask_round", IX86_BUILTIN_RNDSCALESH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_getexpv32hf_mask_round, "__builtin_ia32_getexpph512_mask", IX86_BUILTIN_GETEXPPH512, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_sgetexpv8hf_mask_round, "__builtin_ia32_getexpsh_mask_round", IX86_BUILTIN_GETEXPSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_getmantv32hf_mask_round, "__builtin_ia32_getmantph512_mask", IX86_BUILTIN_GETMANTPH512, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vgetmantv8hf_mask_round, "__builtin_ia32_getmantsh_mask_round", IX86_BUILTIN_GETMANTSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v16si_mask_round, "__builtin_ia32_vcvtph2dq512_mask_round", IX86_BUILTIN_VCVTPH2DQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v16si_mask_round, "__builtin_ia32_vcvtph2udq512_mask_round", IX86_BUILTIN_VCVTPH2UDQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv16si2_mask_round, "__builtin_ia32_vcvttph2dq512_mask_round", IX86_BUILTIN_VCVTTPH2DQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv16si2_mask_round, "__builtin_ia32_vcvttph2udq512_mask_round", IX86_BUILTIN_VCVTTPH2UDQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v8di_mask_round, "__builtin_ia32_vcvtph2qq512_mask_round", IX86_BUILTIN_VCVTPH2QQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v8di_mask_round, "__builtin_ia32_vcvtph2uqq512_mask_round", IX86_BUILTIN_VCVTPH2UQQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8di2_mask_round, "__builtin_ia32_vcvttph2qq512_mask_round", IX86_BUILTIN_VCVTTPH2QQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8di2_mask_round, "__builtin_ia32_vcvttph2uqq512_mask_round", IX86_BUILTIN_VCVTTPH2UQQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v32hi_mask_round, "__builtin_ia32_vcvtph2w512_mask_round", IX86_BUILTIN_VCVTPH2W512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v32hi_mask_round, "__builtin_ia32_vcvtph2uw512_mask_round", IX86_BUILTIN_VCVTPH2UW512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv32hi2_mask_round, "__builtin_ia32_vcvttph2w512_mask_round", IX86_BUILTIN_VCVTTPH2W512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv32hi2_mask_round, "__builtin_ia32_vcvttph2uw512_mask_round", IX86_BUILTIN_VCVTTPH2UW512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v32hi_mask_round, "__builtin_ia32_vcvtw2ph512_mask_round", IX86_BUILTIN_VCVTW2PH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HI_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v32hi_mask_round, "__builtin_ia32_vcvtuw2ph512_mask_round", IX86_BUILTIN_VCVTUW2PH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HI_V32HF_USI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v16si_mask_round, "__builtin_ia32_vcvtdq2ph512_mask_round", IX86_BUILTIN_VCVTDQ2PH512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SI_V16HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v16si_mask_round, "__builtin_ia32_vcvtudq2ph512_mask_round", IX86_BUILTIN_VCVTUDQ2PH512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SI_V16HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v8di_mask_round, "__builtin_ia32_vcvtqq2ph512_mask_round", IX86_BUILTIN_VCVTQQ2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DI_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v8di_mask_round, "__builtin_ia32_vcvtuqq2ph512_mask_round", IX86_BUILTIN_VCVTUQQ2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DI_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2si_round, "__builtin_ia32_vcvtsh2si32_round", IX86_BUILTIN_VCVTSH2SI32_ROUND, UNKNOWN, (int) INT_FTYPE_V8HF_INT)
+BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2siq_round, "__builtin_ia32_vcvtsh2si64_round", IX86_BUILTIN_VCVTSH2SI64_ROUND, UNKNOWN, (int) INT64_FTYPE_V8HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2usi_round, "__builtin_ia32_vcvtsh2usi32_round", IX86_BUILTIN_VCVTSH2USI32_ROUND, UNKNOWN, (int) UINT_FTYPE_V8HF_INT)
+BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2usiq_round, "__builtin_ia32_vcvtsh2usi64_round", IX86_BUILTIN_VCVTSH2USI64_ROUND, UNKNOWN, (int) UINT64_FTYPE_V8HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncsi2_round, "__builtin_ia32_vcvttsh2si32_round", IX86_BUILTIN_VCVTTSH2SI32_ROUND, UNKNOWN, (int) INT_FTYPE_V8HF_INT)
+BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncdi2_round, "__builtin_ia32_vcvttsh2si64_round", IX86_BUILTIN_VCVTTSH2SI64_ROUND, UNKNOWN, (int) INT64_FTYPE_V8HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncsi2_round, "__builtin_ia32_vcvttsh2usi32_round", IX86_BUILTIN_VCVTTSH2USI32_ROUND, UNKNOWN, (int) UINT_FTYPE_V8HF_INT)
+BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncdi2_round, "__builtin_ia32_vcvttsh2usi64_round", IX86_BUILTIN_VCVTTSH2USI64_ROUND, UNKNOWN, (int) UINT64_FTYPE_V8HF_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsi2sh_round, "__builtin_ia32_vcvtsi2sh32_round", IX86_BUILTIN_VCVTSI2SH32_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_INT)
+BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsi2shq_round, "__builtin_ia32_vcvtsi2sh64_round", IX86_BUILTIN_VCVTSI2SH64_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT64_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtusi2sh_round, "__builtin_ia32_vcvtusi2sh32_round", IX86_BUILTIN_VCVTUSI2SH32_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_UINT_INT)
+BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtusi2shq_round, "__builtin_ia32_vcvtusi2sh64_round", IX86_BUILTIN_VCVTUSI2SH64_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_UINT64_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv8df2_mask_round, "__builtin_ia32_vcvtph2pd512_mask_round", IX86_BUILTIN_VCVTPH2PD512_MASK_ROUND, UNKNOWN, (int) V8DF_FTYPE_V8HF_V8DF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv16sf2_mask_round, "__builtin_ia32_vcvtph2psx512_mask_round", IX86_BUILTIN_VCVTPH2PSX512_MASK_ROUND, UNKNOWN, (int) V16SF_FTYPE_V16HF_V16SF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v8df_mask_round, "__builtin_ia32_vcvtpd2ph512_mask_round", IX86_BUILTIN_VCVTPD2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v16sf_mask_round, "__builtin_ia32_vcvtps2phx512_mask_round", IX86_BUILTIN_VCVTPS2PHX512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SF_V16HF_UHI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2ss_mask_round, "__builtin_ia32_vcvtsh2ss_mask_round", IX86_BUILTIN_VCVTSH2SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2sd_mask_round, "__builtin_ia32_vcvtsh2sd_mask_round", IX86_BUILTIN_VCVTSH2SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtss2sh_mask_round, "__builtin_ia32_vcvtss2sh_mask_round", IX86_BUILTIN_VCVTSS2SH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsd2sh_mask_round, "__builtin_ia32_vcvtsd2sh_mask_round", IX86_BUILTIN_VCVTSD2SH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT)
BDESC_END (ROUND_ARGS, MULTI_ARG)
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index e117afb..bfafd15 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -9710,6 +9710,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16HI_FTYPE_V16SI_V16HI_UHI:
case V16QI_FTYPE_V16SI_V16QI_UHI:
case V16QI_FTYPE_V8DI_V16QI_UQI:
+ case V32HF_FTYPE_V32HF_V32HF_USI:
case V16SF_FTYPE_V16SF_V16SF_UHI:
case V16SF_FTYPE_V4SF_V16SF_UHI:
case V16SI_FTYPE_SI_V16SI_UHI:
@@ -9739,20 +9740,40 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16HI_FTYPE_HI_V16HI_UHI:
case V8HI_FTYPE_V8HI_V8HI_UQI:
case V8HI_FTYPE_HI_V8HI_UQI:
+ case V16HF_FTYPE_V16HF_V16HF_UHI:
case V8SF_FTYPE_V8HI_V8SF_UQI:
case V4SF_FTYPE_V8HI_V4SF_UQI:
+ case V8SI_FTYPE_V8HF_V8SI_UQI:
+ case V8SF_FTYPE_V8HF_V8SF_UQI:
case V8SI_FTYPE_V8SF_V8SI_UQI:
case V4SI_FTYPE_V4SF_V4SI_UQI:
+ case V4SI_FTYPE_V8HF_V4SI_UQI:
+ case V4SF_FTYPE_V8HF_V4SF_UQI:
+ case V4DI_FTYPE_V8HF_V4DI_UQI:
case V4DI_FTYPE_V4SF_V4DI_UQI:
+ case V2DI_FTYPE_V8HF_V2DI_UQI:
case V2DI_FTYPE_V4SF_V2DI_UQI:
+ case V8HF_FTYPE_V8HF_V8HF_UQI:
+ case V8HF_FTYPE_V8HI_V8HF_UQI:
+ case V8HF_FTYPE_V8SI_V8HF_UQI:
+ case V8HF_FTYPE_V8SF_V8HF_UQI:
+ case V8HF_FTYPE_V4SI_V8HF_UQI:
+ case V8HF_FTYPE_V4SF_V8HF_UQI:
+ case V8HF_FTYPE_V4DI_V8HF_UQI:
+ case V8HF_FTYPE_V4DF_V8HF_UQI:
+ case V8HF_FTYPE_V2DI_V8HF_UQI:
+ case V8HF_FTYPE_V2DF_V8HF_UQI:
case V4SF_FTYPE_V4DI_V4SF_UQI:
case V4SF_FTYPE_V2DI_V4SF_UQI:
case V4DF_FTYPE_V4DI_V4DF_UQI:
+ case V4DF_FTYPE_V8HF_V4DF_UQI:
+ case V2DF_FTYPE_V8HF_V2DF_UQI:
case V2DF_FTYPE_V2DI_V2DF_UQI:
case V16QI_FTYPE_V8HI_V16QI_UQI:
case V16QI_FTYPE_V16HI_V16QI_UHI:
case V16QI_FTYPE_V4SI_V16QI_UQI:
case V16QI_FTYPE_V8SI_V16QI_UQI:
+ case V8HI_FTYPE_V8HF_V8HI_UQI:
case V8HI_FTYPE_V4SI_V8HI_UQI:
case V8HI_FTYPE_V8SI_V8HI_UQI:
case V16QI_FTYPE_V2DI_V16QI_UQI:
@@ -9810,6 +9831,8 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V8DI_FTYPE_DI_V8DI_UQI:
case V16SF_FTYPE_V8SF_V16SF_UHI:
case V16SI_FTYPE_V8SI_V16SI_UHI:
+ case V16HF_FTYPE_V16HI_V16HF_UHI:
+ case V16HI_FTYPE_V16HF_V16HI_UHI:
case V16HI_FTYPE_V16HI_V16HI_UHI:
case V8HI_FTYPE_V16QI_V8HI_UQI:
case V16HI_FTYPE_V16QI_V16HI_UHI:
@@ -9910,6 +9933,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case HI_FTYPE_V16SF_INT_UHI:
case QI_FTYPE_V8SF_INT_UQI:
case QI_FTYPE_V4SF_INT_UQI:
+ case QI_FTYPE_V8HF_INT_UQI:
+ case HI_FTYPE_V16HF_INT_UHI:
+ case SI_FTYPE_V32HF_INT_USI:
case V4SI_FTYPE_V4SI_V4SI_UHI:
case V8SI_FTYPE_V8SI_V8SI_UHI:
nargs = 3;
@@ -10058,6 +10084,8 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
+ case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
+ case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
@@ -10229,8 +10257,10 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case CODE_FOR_avx_vpermilv4df_mask:
case CODE_FOR_avx512f_getmantv8df_mask:
case CODE_FOR_avx512f_getmantv16sf_mask:
+ case CODE_FOR_avx512vl_getmantv16hf_mask:
case CODE_FOR_avx512vl_getmantv8sf_mask:
case CODE_FOR_avx512vl_getmantv4df_mask:
+ case CODE_FOR_avx512fp16_getmantv8hf_mask:
case CODE_FOR_avx512vl_getmantv4sf_mask:
case CODE_FOR_avx512vl_getmantv2df_mask:
case CODE_FOR_avx512dq_rangepv8df_mask_round:
@@ -10645,16 +10675,24 @@ ix86_expand_round_builtin (const struct builtin_description *d,
{
case UINT64_FTYPE_V2DF_INT:
case UINT64_FTYPE_V4SF_INT:
+ case UINT64_FTYPE_V8HF_INT:
case UINT_FTYPE_V2DF_INT:
case UINT_FTYPE_V4SF_INT:
+ case UINT_FTYPE_V8HF_INT:
case INT64_FTYPE_V2DF_INT:
case INT64_FTYPE_V4SF_INT:
+ case INT64_FTYPE_V8HF_INT:
case INT_FTYPE_V2DF_INT:
case INT_FTYPE_V4SF_INT:
+ case INT_FTYPE_V8HF_INT:
nargs = 2;
break;
case V32HF_FTYPE_V32HF_V32HF_INT:
case V8HF_FTYPE_V8HF_V8HF_INT:
+ case V8HF_FTYPE_V8HF_INT_INT:
+ case V8HF_FTYPE_V8HF_UINT_INT:
+ case V8HF_FTYPE_V8HF_INT64_INT:
+ case V8HF_FTYPE_V8HF_UINT64_INT:
case V4SF_FTYPE_V4SF_UINT_INT:
case V4SF_FTYPE_V4SF_UINT64_INT:
case V2DF_FTYPE_V2DF_UINT64_INT:
@@ -10669,18 +10707,29 @@ ix86_expand_round_builtin (const struct builtin_description *d,
break;
case V8SF_FTYPE_V8DF_V8SF_QI_INT:
case V8DF_FTYPE_V8DF_V8DF_QI_INT:
+ case V32HI_FTYPE_V32HF_V32HI_USI_INT:
case V8SI_FTYPE_V8DF_V8SI_QI_INT:
+ case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
case V8DI_FTYPE_V8DF_V8DI_QI_INT:
case V8SF_FTYPE_V8DI_V8SF_QI_INT:
case V8DF_FTYPE_V8DI_V8DF_QI_INT:
+ case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
+ case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
+ case V32HF_FTYPE_V32HI_V32HF_USI_INT:
+ case V32HF_FTYPE_V32HF_V32HF_USI_INT:
case V16SF_FTYPE_V16SF_V16SF_HI_INT:
case V8DI_FTYPE_V8SF_V8DI_QI_INT:
case V16SF_FTYPE_V16SI_V16SF_HI_INT:
case V16SI_FTYPE_V16SF_V16SI_HI_INT:
+ case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
+ case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
case V8DF_FTYPE_V8SF_V8DF_QI_INT:
case V16SF_FTYPE_V16HI_V16SF_HI_INT:
case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
+ case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
+ case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
+ case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
nargs = 4;
break;
case V4SF_FTYPE_V4SF_V4SF_INT_INT:
@@ -10694,8 +10743,10 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
+ case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
+ case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
@@ -10703,8 +10754,11 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
+ case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
+ case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
nargs = 5;
break;
+ case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
@@ -10727,6 +10781,7 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
+ case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
nargs = 6;
nargs_constant = 4;
break;
@@ -10763,10 +10818,12 @@ ix86_expand_round_builtin (const struct builtin_description *d,
{
case CODE_FOR_avx512f_getmantv8df_mask_round:
case CODE_FOR_avx512f_getmantv16sf_mask_round:
+ case CODE_FOR_avx512bw_getmantv32hf_mask_round:
case CODE_FOR_avx512f_vgetmantv2df_round:
case CODE_FOR_avx512f_vgetmantv2df_mask_round:
case CODE_FOR_avx512f_vgetmantv4sf_round:
case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
+ case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
error ("the immediate argument must be a 4-bit immediate");
return const0_rtx;
case CODE_FOR_avx512f_cmpv8df3_mask_round:
@@ -11070,6 +11127,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
case VOID_FTYPE_PFLOAT_V16SF_UHI:
case VOID_FTYPE_PFLOAT_V8SF_UQI:
case VOID_FTYPE_PFLOAT_V4SF_UQI:
+ case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
case VOID_FTYPE_PV32QI_V32HI_USI:
case VOID_FTYPE_PV16QI_V16HI_UHI:
case VOID_FTYPE_PUDI_V8HI_UQI:
@@ -11142,6 +11200,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
+ case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
nargs = 3;
klass = load;
memory = 0;
@@ -14054,7 +14113,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
tmp1 = gen_reg_rtx (SImode);
emit_move_insn (tmp1, gen_lowpart (SImode, val));
- /* Insert the SImode value as low element of a V4SImode vector. */
+ /* Insert the SImode value as low element of a V4SImode vector. */
tmp2 = gen_reg_rtx (V4SImode);
emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
@@ -14179,6 +14238,8 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
break;
case E_V8HImode:
use_vector_set = TARGET_SSE2;
+ gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
+ ? gen_vec_setv8hi_0 : NULL;
break;
case E_V8QImode:
use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
@@ -14190,8 +14251,12 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
use_vector_set = TARGET_SSE4_1;
break;
case E_V32QImode:
+ use_vector_set = TARGET_AVX;
+ break;
case E_V16HImode:
use_vector_set = TARGET_AVX;
+ gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
+ ? gen_vec_setv16hi_0 : NULL;
break;
case E_V8SImode:
use_vector_set = TARGET_AVX;
@@ -14239,6 +14304,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
use_vector_set = TARGET_AVX512FP16 && one_var == 0;
gen_vec_set_0 = gen_vec_setv32hf_0;
break;
+ case E_V32HImode:
+ use_vector_set = TARGET_AVX512FP16 && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv32hi_0;
default:
break;
}
@@ -14638,7 +14706,7 @@ ix86_expand_vector_init_interleave (machine_mode mode,
switch (mode)
{
case E_V8HFmode:
- gen_load_even = gen_vec_setv8hf;
+ gen_load_even = gen_vec_interleave_lowv8hf;
gen_interleave_first_low = gen_vec_interleave_lowv4si;
gen_interleave_second_low = gen_vec_interleave_lowv2di;
inner_mode = HFmode;
@@ -14673,35 +14741,40 @@ ix86_expand_vector_init_interleave (machine_mode mode,
op = ops [i + i];
if (inner_mode == HFmode)
{
- /* Convert HFmode to HImode. */
- op1 = gen_reg_rtx (HImode);
- op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0);
- op = gen_reg_rtx (HImode);
- emit_move_insn (op, op1);
+ rtx even, odd;
+ /* Use vpunpcklwd to pack 2 HFmode. */
+ op0 = gen_reg_rtx (V8HFmode);
+ even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
+ odd = lowpart_subreg (V8HFmode,
+ force_reg (HFmode, ops[i + i + 1]),
+ HFmode);
+ emit_insn (gen_load_even (op0, even, odd));
}
+ else
+ {
+ /* Extend the odd element to SImode using a paradoxical SUBREG. */
+ op0 = gen_reg_rtx (SImode);
+ emit_move_insn (op0, gen_lowpart (SImode, op));
- /* Extend the odd elment to SImode using a paradoxical SUBREG. */
- op0 = gen_reg_rtx (SImode);
- emit_move_insn (op0, gen_lowpart (SImode, op));
-
- /* Insert the SImode value as low element of V4SImode vector. */
- op1 = gen_reg_rtx (V4SImode);
- op0 = gen_rtx_VEC_MERGE (V4SImode,
- gen_rtx_VEC_DUPLICATE (V4SImode,
- op0),
- CONST0_RTX (V4SImode),
- const1_rtx);
- emit_insn (gen_rtx_SET (op1, op0));
+ /* Insert the SImode value as low element of V4SImode vector. */
+ op1 = gen_reg_rtx (V4SImode);
+ op0 = gen_rtx_VEC_MERGE (V4SImode,
+ gen_rtx_VEC_DUPLICATE (V4SImode,
+ op0),
+ CONST0_RTX (V4SImode),
+ const1_rtx);
+ emit_insn (gen_rtx_SET (op1, op0));
- /* Cast the V4SImode vector back to a vector in orignal mode. */
- op0 = gen_reg_rtx (mode);
- emit_move_insn (op0, gen_lowpart (mode, op1));
+ /* Cast the V4SImode vector back to a vector in original mode. */
+ op0 = gen_reg_rtx (mode);
+ emit_move_insn (op0, gen_lowpart (mode, op1));
- /* Load even elements into the second position. */
- emit_insn (gen_load_even (op0,
- force_reg (inner_mode,
- ops [i + i + 1]),
- const1_rtx));
+ /* Load even elements into the second position. */
+ emit_insn (gen_load_even (op0,
+ force_reg (inner_mode,
+ ops[i + i + 1]),
+ const1_rtx));
+ }
/* Cast vector to FIRST_IMODE vector. */
ops[i] = gen_reg_rtx (first_imode);
@@ -15182,6 +15255,7 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
machine_mode inner_mode = GET_MODE_INNER (mode);
machine_mode half_mode;
bool use_vec_merge = false;
+ bool blendm_const = false;
rtx tmp;
static rtx (*gen_extract[7][2]) (rtx, rtx)
= {
@@ -15369,7 +15443,14 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
return;
case E_V8HFmode:
- use_vec_merge = true;
+ if (TARGET_AVX2)
+ {
+ mmode = SImode;
+ gen_blendm = gen_sse4_1_pblendph;
+ blendm_const = true;
+ }
+ else
+ use_vec_merge = true;
break;
case E_V8HImode:
@@ -15396,10 +15477,20 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
goto half;
case E_V16HFmode:
- half_mode = V8HFmode;
- j = 6;
- n = 8;
- goto half;
+ if (TARGET_AVX2)
+ {
+ mmode = SImode;
+ gen_blendm = gen_avx2_pblendph;
+ blendm_const = true;
+ break;
+ }
+ else
+ {
+ half_mode = V8HFmode;
+ j = 6;
+ n = 8;
+ goto half;
+ }
case E_V16HImode:
half_mode = V8HImode;
@@ -15560,15 +15651,15 @@ quarter:
{
tmp = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
+ rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
/* The avx512*_blendm<mode> expanders have different operand order
from VEC_MERGE. In VEC_MERGE, the first input operand is used for
elements where the mask is set and second input operand otherwise,
in {sse,avx}*_*blend* the first input operand is used for elements
where the mask is clear and second input operand otherwise. */
- emit_insn (gen_blendm (target, target, tmp,
- force_reg (mmode,
- gen_int_mode (HOST_WIDE_INT_1U << elt,
- mmode))));
+ if (!blendm_const)
+ merge_mask = force_reg (mmode, merge_mask);
+ emit_insn (gen_blendm (target, target, tmp, merge_mask));
}
else if (use_vec_merge)
{
diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 5a99ea7..a525a83 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -2210,15 +2210,34 @@ remove_partial_avx_dependency (void)
!= AVX_PARTIAL_XMM_UPDATE_TRUE)
continue;
- if (!v4sf_const0)
- v4sf_const0 = gen_reg_rtx (V4SFmode);
-
/* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and
vec_merge with subreg. */
rtx src = SET_SRC (set);
rtx dest = SET_DEST (set);
machine_mode dest_mode = GET_MODE (dest);
+ machine_mode src_mode = GET_MODE (XEXP (src, 0));
+
+ switch (src_mode)
+ {
+ case E_SFmode:
+ case E_DFmode:
+ if (TARGET_USE_VECTOR_FP_CONVERTS
+ || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
+ continue;
+ break;
+ case E_SImode:
+ case E_DImode:
+ if (TARGET_USE_VECTOR_CONVERTS
+ || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ if (!v4sf_const0)
+ v4sf_const0 = gen_reg_rtx (V4SFmode);
rtx zero;
machine_mode dest_vecmode;
diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index fcadfcd..2a2c8b8 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -90,6 +90,8 @@ VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */
VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */
VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */
VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */
+VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
+VECTOR_MODE (FLOAT, HF, 6); /* V6HF */
VECTOR_MODE (INT, TI, 1); /* V1TI */
VECTOR_MODE (INT, DI, 1); /* V1DI */
VECTOR_MODE (INT, SI, 1); /* V1SI */
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index c0006b3..e7a3bd4 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -724,7 +724,7 @@ static const struct processor_costs *processor_cost_table[] =
&slm_cost,
&slm_cost,
&slm_cost,
- &slm_cost,
+ &tremont_cost,
&slm_cost,
&slm_cost,
&skylake_cost,
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dcae34b..708834a 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -320,7 +320,7 @@ struct ix86_address
addr_space_t seg;
};
-extern int ix86_decompose_address (rtx, struct ix86_address *);
+extern bool ix86_decompose_address (rtx, struct ix86_address *);
extern int memory_address_length (rtx, bool);
extern void x86_output_aligned_bss (FILE *, tree, const char *,
unsigned HOST_WIDE_INT, unsigned);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7b173bc..afc2674 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -10101,10 +10101,10 @@ ix86_live_on_entry (bitmap regs)
}
/* Extract the parts of an RTL expression that is a valid memory address
- for an instruction. Return 0 if the structure of the address is
+ for an instruction. Return false if the structure of the address is
grossly off. */
-int
+bool
ix86_decompose_address (rtx addr, struct ix86_address *out)
{
rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
@@ -10123,17 +10123,17 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
{
addr = XEXP (addr, 0);
if (CONST_INT_P (addr))
- return 0;
+ return false;
}
else if (GET_CODE (addr) == AND
&& const_32bit_mask (XEXP (addr, 1), DImode))
{
addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
if (addr == NULL_RTX)
- return 0;
+ return false;
if (CONST_INT_P (addr))
- return 0;
+ return false;
}
else if (GET_CODE (addr) == AND)
{
@@ -10167,7 +10167,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
{
addr = SUBREG_REG (addr);
if (CONST_INT_P (addr))
- return 0;
+ return false;
}
}
@@ -10178,7 +10178,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
if (REG_P (SUBREG_REG (addr)))
base = addr;
else
- return 0;
+ return false;
}
else if (GET_CODE (addr) == PLUS)
{
@@ -10189,13 +10189,13 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
do
{
if (n >= 4)
- return 0;
+ return false;
addends[n++] = XEXP (op, 1);
op = XEXP (op, 0);
}
while (GET_CODE (op) == PLUS);
if (n >= 4)
- return 0;
+ return false;
addends[n] = op;
for (i = n; i >= 0; --i)
@@ -10205,28 +10205,28 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
{
case MULT:
if (index)
- return 0;
+ return false;
index = XEXP (op, 0);
scale_rtx = XEXP (op, 1);
break;
case ASHIFT:
if (index)
- return 0;
+ return false;
index = XEXP (op, 0);
tmp = XEXP (op, 1);
if (!CONST_INT_P (tmp))
- return 0;
+ return false;
scale = INTVAL (tmp);
if ((unsigned HOST_WIDE_INT) scale > 3)
- return 0;
+ return false;
scale = 1 << scale;
break;
case ZERO_EXTEND:
op = XEXP (op, 0);
if (GET_CODE (op) != UNSPEC)
- return 0;
+ return false;
/* FALLTHRU */
case UNSPEC:
@@ -10235,12 +10235,12 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
&& seg == ADDR_SPACE_GENERIC)
seg = DEFAULT_TLS_SEG_REG;
else
- return 0;
+ return false;
break;
case SUBREG:
if (!REG_P (SUBREG_REG (op)))
- return 0;
+ return false;
/* FALLTHRU */
case REG:
@@ -10249,7 +10249,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
else if (!index)
index = op;
else
- return 0;
+ return false;
break;
case CONST:
@@ -10257,12 +10257,12 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
case SYMBOL_REF:
case LABEL_REF:
if (disp)
- return 0;
+ return false;
disp = op;
break;
default:
- return 0;
+ return false;
}
}
}
@@ -10277,10 +10277,10 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
index = XEXP (addr, 0);
tmp = XEXP (addr, 1);
if (!CONST_INT_P (tmp))
- return 0;
+ return false;
scale = INTVAL (tmp);
if ((unsigned HOST_WIDE_INT) scale > 3)
- return 0;
+ return false;
scale = 1 << scale;
}
else
@@ -10294,14 +10294,14 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
&& REG_P (SUBREG_REG (index)))
;
else
- return 0;
+ return false;
}
/* Extract the integral value of scale. */
if (scale_rtx)
{
if (!CONST_INT_P (scale_rtx))
- return 0;
+ return false;
scale = INTVAL (scale_rtx);
}
@@ -10354,7 +10354,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
out->scale = scale;
out->seg = seg;
- return 1;
+ return true;
}
/* Return cost of the memory address x.
@@ -16976,6 +16976,7 @@ ix86_sched_init_global (FILE *, int, int)
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
case PROCESSOR_HASWELL:
+ case PROCESSOR_TREMONT:
case PROCESSOR_GENERIC:
/* Do not perform multipass scheduling for pre-reload schedule
to save compile time. */
@@ -19443,8 +19444,11 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to,
/* Vector registers do not support QI or HImode loads. If we don't
disallow a change to these modes, reload will assume it's ok to
drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
- the vec_dupv4hi pattern. */
- if (GET_MODE_SIZE (from) < 4)
+ the vec_dupv4hi pattern.
+ NB: AVX512FP16 supports vmovw, which can load 16-bit data into an
+ SSE register. */
+ int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 4;
+ if (GET_MODE_SIZE (from) < mov_size)
return false;
}
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index e76bb55..ec60b89 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -334,6 +334,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY]
#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY]
+#define TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY \
+ ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY]
+#define TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY \
+ ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY]
#define TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
#define TARGET_SSE_UNALIGNED_STORE_OPTIMAL \
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 13f6f57..c82a9dc 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4535,7 +4535,8 @@
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
"!TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+ && TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY
+ && epilogue_completed
&& optimize_function_for_speed_p (cfun)
&& (!REG_P (operands[1])
|| (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1])))
@@ -4708,7 +4709,8 @@
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
"!TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+ && TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY
+ && epilogue_completed
&& optimize_function_for_speed_p (cfun)
&& (!REG_P (operands[1])
|| (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1])))
@@ -5243,7 +5245,8 @@
[(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
"!TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+ && TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY
+ && epilogue_completed
&& optimize_function_for_speed_p (cfun)
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 516eb45..d7a1328 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -396,6 +396,13 @@
(define_mode_iterator VF1_AVX512ER_128_256
[(V16SF "TARGET_AVX512ER") (V8SF "TARGET_AVX") V4SF])
+(define_mode_iterator VFH_AVX512VL
+ [(V32HF "TARGET_AVX512FP16")
+ (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+ V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
+ V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
+
(define_mode_iterator VF2_AVX512VL
[V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
@@ -405,6 +412,9 @@
(define_mode_iterator VF_AVX512FP16
[V32HF V16HF V8HF])
+(define_mode_iterator VF_AVX512FP16VL
+ [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
+
;; All vector integer modes
(define_mode_iterator VI
[(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
@@ -493,6 +503,11 @@
(define_mode_iterator VI2_AVX512VL
[(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI])
+(define_mode_iterator VI2H_AVX512VL
+ [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI
+ (V8SI "TARGET_AVX512VL") V16SI
+ V8DI ])
+
(define_mode_iterator VI1_AVX512VL_F
[V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F")])
@@ -622,6 +637,9 @@
(V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2")
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")])
+(define_mode_iterator VF4_128_8_256
+ [V4DF V4SF])
+
(define_mode_iterator VI1_AVX512VLBW
[(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL")
(V16QI "TARGET_AVX512VL")])
@@ -707,7 +725,8 @@
[(V16SF "V4SF") (V8DF "V2DF") (V16SI "TI") (V8DI "TI")])
(define_mode_attr vecmemsuffix
- [(V16SF "{z}") (V8SF "{y}") (V4SF "{x}")
+ [(V32HF "{z}") (V16HF "{y}") (V8HF "{x}")
+ (V16SF "{z}") (V8SF "{y}") (V4SF "{x}")
(V8DF "{z}") (V4DF "{y}") (V2DF "{x}")])
(define_mode_attr ssedoublemodelower
@@ -727,6 +746,11 @@
[(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")
(V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")])
+(define_mode_attr sseintconvert
+ [(V32HI "w") (V16HI "w") (V8HI "w")
+ (V16SI "dq") (V8SI "dq") (V4SI "dq")
+ (V8DI "qq") (V4DI "qq") (V2DI "qq")])
+
;; All 128bit vector integer modes
(define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI])
@@ -768,6 +792,7 @@
(V32HF "TARGET_AVX512BW")])
;; Int-float size matches
+(define_mode_iterator VI2F [V8HI V16HI V32HI V8HF V16HF V32HF])
(define_mode_iterator VI4F_128 [V4SI V4SF])
(define_mode_iterator VI8F_128 [V2DI V2DF])
(define_mode_iterator VI4F_256 [V8SI V8SF])
@@ -782,6 +807,12 @@
(V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")])
(define_mode_iterator VF48_I1248
[V16SI V16SF V8DI V8DF V32HI V64QI])
+(define_mode_iterator VF48H_AVX512VL
+ [V8DF V16SF (V8SF "TARGET_AVX512VL")])
+
+(define_mode_iterator VF48_128
+ [V2DF V4SF])
+
(define_mode_iterator VI48F
[V16SI V16SF V8DI V8DF
(V8SI "TARGET_AVX512VL") (V8SF "TARGET_AVX512VL")
@@ -806,6 +837,7 @@
(V8SF "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")
V16SF V8DF])
+(define_mode_iterator V8_128 [V8HI V8HF])
(define_mode_iterator V16_256 [V16HI V16HF])
(define_mode_iterator V32_512 [V32HI V32HF])
@@ -918,9 +950,9 @@
;; Mapping of vector float modes to an integer mode of the same size
(define_mode_attr sseintvecmode
- [(V16SF "V16SI") (V8DF "V8DI")
- (V8SF "V8SI") (V4DF "V4DI")
- (V4SF "V4SI") (V2DF "V2DI")
+ [(V32HF "V32HI") (V16SF "V16SI") (V8DF "V8DI")
+ (V16HF "V16HI") (V8SF "V8SI") (V4DF "V4DI")
+ (V8HF "V8HI") (V4SF "V4SI") (V2DF "V2DI")
(V16SI "V16SI") (V8DI "V8DI")
(V8SI "V8SI") (V4DI "V4DI")
(V4SI "V4SI") (V2DI "V2DI")
@@ -971,6 +1003,13 @@
(V4SF "v2sf")
(V32HF "v16hf") (V16HF "v8hf") (V8HF "v4hf")])
+;; Mapping of vector modes to the vector HF mode they convert to or from.
+(define_mode_attr ssePHmode
+ [(V32HI "V32HF") (V16HI "V16HF") (V8HI "V8HF")
+ (V16SI "V16HF") (V8SI "V8HF") (V4SI "V8HF")
+ (V8DI "V8HF") (V4DI "V8HF") (V2DI "V8HF")
+ (V8DF "V8HF") (V16SF "V16HF") (V8SF "V8HF")])
+
;; Mapping of vector modes to packed single mode of the same size
(define_mode_attr ssePSmode
[(V16SI "V16SF") (V8DF "V16SF")
@@ -1116,7 +1155,8 @@
;; Mapping of mode to cast intrinsic name
(define_mode_attr castmode
- [(V8SI "si") (V8SF "ps") (V4DF "pd")
+ [(V4SF "ps") (V2DF "pd")
+ (V8SI "si") (V8SF "ps") (V4DF "pd")
(V16SI "si") (V16SF "ps") (V8DF "pd")])
;; i128 for integer vectors and TARGET_AVX2, f128 otherwise.
@@ -1349,13 +1389,13 @@
[(set (match_dup 0) (match_dup 1))])
(define_insn "avx512f_mov<ssescalarmodelower>_mask"
- [(set (match_operand:VF_128 0 "register_operand" "=v")
- (vec_merge:VF_128
- (vec_merge:VF_128
- (match_operand:VF_128 2 "register_operand" "v")
- (match_operand:VF_128 3 "nonimm_or_0_operand" "0C")
+ [(set (match_operand:VFH_128 0 "register_operand" "=v")
+ (vec_merge:VFH_128
+ (vec_merge:VFH_128
+ (match_operand:VFH_128 2 "register_operand" "v")
+ (match_operand:VFH_128 3 "nonimm_or_0_operand" "0C")
(match_operand:QI 4 "register_operand" "Yk"))
- (match_operand:VF_128 1 "register_operand" "v")
+ (match_operand:VFH_128 1 "register_operand" "v")
(const_int 1)))]
"TARGET_AVX512F"
"vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}"
@@ -1368,7 +1408,7 @@
(vec_merge:<ssevecmode>
(vec_merge:<ssevecmode>
(vec_duplicate:<ssevecmode>
- (match_operand:MODEF 1 "memory_operand"))
+ (match_operand:MODEFH 1 "memory_operand"))
(match_operand:<ssevecmode> 2 "nonimm_or_0_operand")
(match_operand:QI 3 "register_operand"))
(match_dup 4)
@@ -1381,7 +1421,7 @@
(vec_merge:<ssevecmode>
(vec_merge:<ssevecmode>
(vec_duplicate:<ssevecmode>
- (match_operand:MODEF 1 "memory_operand" "m"))
+ (match_operand:MODEFH 1 "memory_operand" "m"))
(match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C")
(match_operand:QI 3 "register_operand" "Yk"))
(match_operand:<ssevecmode> 4 "const0_operand" "C")
@@ -1394,11 +1434,11 @@
(set_attr "mode" "<MODE>")])
(define_insn "avx512f_store<mode>_mask"
- [(set (match_operand:MODEF 0 "memory_operand" "=m")
- (if_then_else:MODEF
+ [(set (match_operand:MODEFH 0 "memory_operand" "=m")
+ (if_then_else:MODEFH
(and:QI (match_operand:QI 2 "register_operand" "Yk")
(const_int 1))
- (vec_select:MODEF
+ (vec_select:MODEFH
(match_operand:<ssevecmode> 1 "register_operand" "v")
(parallel [(const_int 0)]))
(match_dup 0)))]
@@ -2338,6 +2378,30 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "SF")])
+(define_insn "avx512fp16_rcp<mode>2<mask_name>"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=v")
+ (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "vm")]
+ UNSPEC_RCP))]
+ "TARGET_AVX512FP16"
+ "vrcpph\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512fp16_vmrcpv8hf2<mask_scalar_name>"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (unspec:V8HF [(match_operand:V8HF 1 "nonimmediate_operand" "vm")]
+ UNSPEC_RCP)
+ (match_operand:V8HF 2 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vrcpsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")])
+
(define_insn "<mask_codefor>rcp14<mode><mask_name>"
[(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
(unspec:VF_AVX512VL
@@ -2381,8 +2445,8 @@
(set_attr "mode" "<MODE>")])
(define_expand "sqrt<mode>2"
- [(set (match_operand:VF2 0 "register_operand")
- (sqrt:VF2 (match_operand:VF2 1 "vector_operand")))]
+ [(set (match_operand:VF2H 0 "register_operand")
+ (sqrt:VF2H (match_operand:VF2H 1 "vector_operand")))]
"TARGET_SSE2")
(define_expand "sqrt<mode>2"
@@ -2402,8 +2466,8 @@
})
(define_insn "<sse>_sqrt<mode>2<mask_name><round_name>"
- [(set (match_operand:VF 0 "register_operand" "=x,v")
- (sqrt:VF (match_operand:VF 1 "<round_nimm_predicate>" "xBm,<round_constraint>")))]
+ [(set (match_operand:VFH 0 "register_operand" "=x,v")
+ (sqrt:VFH (match_operand:VFH 1 "<round_nimm_predicate>" "xBm,<round_constraint>")))]
"TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
"@
sqrt<ssemodesuffix>\t{%1, %0|%0, %1}
@@ -2416,11 +2480,11 @@
(set_attr "mode" "<MODE>")])
(define_insn "<sse>_vmsqrt<mode>2<mask_scalar_name><round_scalar_name>"
- [(set (match_operand:VF_128 0 "register_operand" "=x,v")
- (vec_merge:VF_128
- (sqrt:VF_128
- (match_operand:VF_128 1 "nonimmediate_operand" "xm,<round_scalar_constraint>"))
- (match_operand:VF_128 2 "register_operand" "0,v")
+ [(set (match_operand:VFH_128 0 "register_operand" "=x,v")
+ (vec_merge:VFH_128
+ (sqrt:VFH_128
+ (match_operand:VFH_128 1 "nonimmediate_operand" "xm,<round_scalar_constraint>"))
+ (match_operand:VFH_128 2 "register_operand" "0,v")
(const_int 1)))]
"TARGET_SSE"
"@
@@ -2473,6 +2537,16 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "<MODE>")])
+(define_insn "<sse>_rsqrt<mode>2<mask_name>"
+ [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=v")
+ (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "vector_operand" "vBm")] UNSPEC_RSQRT))]
+ "TARGET_AVX512FP16"
+ "vrsqrtph\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<MODE>")])
+
(define_insn "<mask_codefor>rsqrt14<mode><mask_name>"
[(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
(unspec:VF_AVX512VL
@@ -2548,6 +2622,19 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "SF")])
+(define_insn "avx512fp16_vmrsqrtv8hf2<mask_scalar_name>"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (unspec:V8HF [(match_operand:V8HF 1 "nonimmediate_operand" "vm")]
+ UNSPEC_RSQRT)
+ (match_operand:V8HF 2 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vrsqrtsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}"
+ [(set_attr "type" "sse")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")])
+
(define_expand "cond_<code><mode>"
[(set (match_operand:VF 0 "register_operand")
(vec_merge:VF
@@ -3200,28 +3287,28 @@
})
(define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
- (unspec:VF_AVX512VL
- [(match_operand:VF_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
+ (unspec:VFH_AVX512VL
+ [(match_operand:VFH_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
(match_operand:SI 2 "const_0_to_255_operand")]
UNSPEC_REDUCE))]
- "TARGET_AVX512DQ"
+ "TARGET_AVX512DQ || (VALID_AVX512FP16_REG_MODE (<MODE>mode))"
"vreduce<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}"
[(set_attr "type" "sse")
(set_attr "prefix" "evex")
(set_attr "mode" "<MODE>")])
(define_insn "reduces<mode><mask_scalar_name><round_saeonly_scalar_name>"
- [(set (match_operand:VF_128 0 "register_operand" "=v")
- (vec_merge:VF_128
- (unspec:VF_128
- [(match_operand:VF_128 1 "register_operand" "v")
- (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")
+ [(set (match_operand:VFH_128 0 "register_operand" "=v")
+ (vec_merge:VFH_128
+ (unspec:VFH_128
+ [(match_operand:VFH_128 1 "register_operand" "v")
+ (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")
(match_operand:SI 3 "const_0_to_255_operand")]
UNSPEC_REDUCE)
(match_dup 1)
(const_int 1)))]
- "TARGET_AVX512DQ"
+ "TARGET_AVX512DQ || (VALID_AVX512FP16_REG_MODE (<MODE>mode))"
"vreduce<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"
[(set_attr "type" "sse")
(set_attr "prefix" "evex")
@@ -5655,6 +5742,552 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
+;; Parallel half-precision floating point conversion operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_int_iterator UNSPEC_US_FIX_NOTRUNC
+ [UNSPEC_UNSIGNED_FIX_NOTRUNC UNSPEC_FIX_NOTRUNC])
+
+(define_int_attr sseintconvertsignprefix
+ [(UNSPEC_UNSIGNED_FIX_NOTRUNC "u")
+ (UNSPEC_FIX_NOTRUNC "")])
+
+(define_mode_attr qq2phsuff
+ [(V32HI "") (V16HI "") (V8HI "")
+ (V16SI "") (V8SI "{y}") (V4SI "{x}")
+ (V8DI "{z}") (V4DI "{y}") (V2DI "{x}")
+ (V16SF "") (V8SF "{y}") (V4SF "{x}")
+ (V8DF "{z}") (V4DF "{y}") (V2DF "{x}")])
+
+(define_insn "avx512fp16_vcvtph2<sseintconvertsignprefix><sseintconvert>_<mode><mask_name><round_name>"
+ [(set (match_operand:VI248_AVX512VL 0 "register_operand" "=v")
+ (unspec:VI248_AVX512VL
+ [(match_operand:<ssePHmode> 1 "<round_nimm_predicate>" "<round_constraint>")]
+ UNSPEC_US_FIX_NOTRUNC))]
+ "TARGET_AVX512FP16"
+ "vcvtph2<sseintconvertsignprefix><sseintconvert>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode><mask_name><round_name>"
+ [(set (match_operand:<ssePHmode> 0 "register_operand" "=v")
+ (any_float:<ssePHmode>
+ (match_operand:VI2H_AVX512VL 1 "<round_nimm_predicate>" "<round_constraint>")))]
+ "TARGET_AVX512FP16"
+ "vcvt<floatsuffix><sseintconvert>2ph<round_qq2phsuff>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm"))
+ (match_dup 2)))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "operands[2] = CONST0_RTX (V4HFmode);")
+
+(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm"))
+ (match_operand:V4HF 2 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0|%0, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V4HF
+ (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm"))
+ (vec_select:V4HF
+ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_dup 4)))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "operands[4] = CONST0_RTX (V4HFmode);")
+
+(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V4HF
+ (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm"))
+ (vec_select:V4HF
+ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_operand:V4HF 4 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask_1"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V4HF
+ (any_float:V4HF (match_operand:VI4_128_8_256 1
+ "vector_operand" "vm"))
+ (match_operand:V4HF 3 "const0_operand" "C")
+ (match_operand:QI 2 "register_operand" "Yk"))
+ (match_operand:V4HF 4 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm"))
+ (match_dup 2)))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "operands[2] = CONST0_RTX (V6HFmode);")
+
+(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm"))
+ (match_operand:V6HF 2 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<floatsuffix>qq2ph{x}\t{%1, %0|%0, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V2HF
+ (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm"))
+ (vec_select:V2HF
+ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_dup 4)))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "operands[4] = CONST0_RTX (V6HFmode);")
+
+(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V2HF
+ (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm"))
+ (vec_select:V2HF
+ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_operand:V6HF 4 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<floatsuffix>qq2ph{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask_1"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V2HF
+ (any_float:V2HF (match_operand:V2DI 1
+ "vector_operand" "vm"))
+ (match_operand:V2HF 3 "const0_operand" "C")
+ (match_operand:QI 2 "register_operand" "Yk"))
+ (match_operand:V6HF 4 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<floatsuffix>qq2ph{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "avx512fp16_vcvtsh2<sseintconvertsignprefix>si<rex64namesuffix><round_name>"
+ [(set (match_operand:SWI48 0 "register_operand" "=r")
+ (unspec:SWI48
+ [(vec_select:HF
+ (match_operand:V8HF 1 "register_operand" "v")
+ (parallel [(const_int 0)]))]
+ UNSPEC_US_FIX_NOTRUNC))]
+ "TARGET_AVX512FP16"
+ "vcvtsh2<sseintconvertsignprefix>si\t{<round_op2>%1, %0|%0, %1<round_op2>}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512fp16_vcvtsh2<sseintconvertsignprefix>si<rex64namesuffix>_2"
+ [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+ (unspec:SWI48
+ [(match_operand:HF 1 "nonimmediate_operand" "v,m")]
+ UNSPEC_US_FIX_NOTRUNC))]
+ "TARGET_AVX512FP16"
+ "vcvtsh2<sseintconvertsignprefix>si\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<MODE>")])
+
+(define_mode_attr sseicvtsuffix
+ [(SI "l") (DI "q")])
+(define_insn "avx512fp16_vcvt<floatsuffix>si2sh<rex64namesuffix><round_name>"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (vec_duplicate:V8HF
+ (any_float:HF
+ (match_operand:SWI48 2 "<round_nimm_scalar_predicate>" "<round_constraint3>")))
+ (match_operand:V8HF 1 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vcvt<floatsuffix>si2sh{<sseicvtsuffix>}\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "HF")])
+
+(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name><round_saeonly_name>"
+ [(set (match_operand:VI2H_AVX512VL 0 "register_operand" "=v")
+ (any_fix:VI2H_AVX512VL
+ (match_operand:<ssePHmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))]
+ "TARGET_AVX512FP16"
+ "vcvttph2<fixsuffix><sseintconvert>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name>"
+ [(set (match_operand:VI4_128_8_256 0 "register_operand" "=v")
+ (any_fix:VI4_128_8_256
+ (vec_select:V4HF
+ (match_operand:V8HF 1 "register_operand" "v")
+ (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvttph2<fixsuffix><sseintconvert>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx512fp16_fix<fixunssuffix>_trunc<mode>2_load<mask_name>"
+ [(set (match_operand:VI4_128_8_256 0 "register_operand" "=v")
+ (any_fix:VI4_128_8_256
+ (match_operand:V4HF 1 "memory_operand" "m")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvttph2<fixsuffix><sseintconvert>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512fp16_fix<fixunssuffix>_truncv2di2<mask_name>"
+ [(set (match_operand:V2DI 0 "register_operand" "=v")
+ (any_fix:V2DI
+ (vec_select:V2HF
+ (match_operand:V8HF 1 "nonimmediate_operand" "v")
+ (parallel [(const_int 0) (const_int 1)]))))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvttph2<fixsuffix>qq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "*avx512fp16_fix<fixunssuffix>_truncv2di2_load<mask_name>"
+ [(set (match_operand:V2DI 0 "register_operand" "=v")
+ (any_fix:V2DI
+ (match_operand:V2HF 1 "memory_operand" "m")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvttph2<fixsuffix>qq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<round_saeonly_name>"
+ [(set (match_operand:SWI48 0 "register_operand" "=r")
+ (any_fix:SWI48
+ (vec_select:HF
+ (match_operand:V8HF 1 "register_operand" "v")
+ (parallel [(const_int 0)]))))]
+ "TARGET_AVX512FP16"
+ "%vcvttsh2<fixsuffix>si\t{<round_saeonly_op2>%1, %0|%0, %k1<round_saeonly_op2>}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2_mem"
+ [(set (match_operand:SWI48 0 "register_operand" "=r")
+ (any_fix:SWI48
+ (match_operand:HF 1 "memory_operand" "vm")))]
+ "TARGET_AVX512FP16"
+ "%vcvttsh2<fixsuffix>si\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sseicvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<MODE>")])
+
+(define_mode_attr ph2pssuffix
+ [(V16SF "x") (V8SF "x") (V4SF "x")
+ (V8DF "") (V4DF "") (V2DF "")])
+
+(define_insn "avx512fp16_float_extend_ph<mode>2<mask_name><round_saeonly_name>"
+ [(set (match_operand:VF48H_AVX512VL 0 "register_operand" "=v")
+ (float_extend:VF48H_AVX512VL
+ (match_operand:<ssePHmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))]
+ "TARGET_AVX512FP16"
+ "vcvtph2<castmode><ph2pssuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512fp16_float_extend_ph<mode>2<mask_name>"
+ [(set (match_operand:VF4_128_8_256 0 "register_operand" "=v")
+ (float_extend:VF4_128_8_256
+ (vec_select:V4HF
+ (match_operand:V8HF 1 "register_operand" "v")
+ (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvtph2<castmode><ph2pssuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx512fp16_float_extend_ph<mode>2_load<mask_name>"
+ [(set (match_operand:VF4_128_8_256 0 "register_operand" "=v")
+ (float_extend:VF4_128_8_256
+ (match_operand:V4HF 1 "memory_operand" "m")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvtph2<castmode><ph2pssuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512fp16_float_extend_phv2df2<mask_name>"
+ [(set (match_operand:V2DF 0 "register_operand" "=v")
+ (float_extend:V2DF
+ (vec_select:V2HF
+ (match_operand:V8HF 1 "register_operand" "v")
+ (parallel [(const_int 0) (const_int 1)]))))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvtph2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "*avx512fp16_float_extend_phv2df2_load<mask_name>"
+ [(set (match_operand:V2DF 0 "register_operand" "=v")
+ (float_extend:V2DF
+ (match_operand:V2HF 1 "memory_operand" "m")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvtph2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "avx512fp16_vcvt<castmode>2ph_<mode><mask_name><round_name>"
+ [(set (match_operand:<ssePHmode> 0 "register_operand" "=v")
+ (float_truncate:<ssePHmode>
+ (match_operand:VF48H_AVX512VL 1 "<round_nimm_predicate>" "<round_constraint>")))]
+ "TARGET_AVX512FP16"
+ "vcvt<castmode>2ph<ph2pssuffix><round_qq2phsuff>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (float_truncate:V4HF
+ (match_operand:VF4_128_8_256 1 "vector_operand" "vm"))
+ (match_dup 2)))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "operands[2] = CONST0_RTX (V4HFmode);")
+
+(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (float_truncate:V4HF
+ (match_operand:VF4_128_8_256 1 "vector_operand" "vm"))
+ (match_operand:V4HF 2 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0|%0, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>_mask"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V4HF
+ (float_truncate:V4HF
+ (match_operand:VF4_128_8_256 1 "vector_operand" "vm"))
+ (vec_select:V4HF
+ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_dup 4)))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "operands[4] = CONST0_RTX (V4HFmode);")
+
+(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>_mask"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V4HF
+ (float_truncate:V4HF
+ (match_operand:VF4_128_8_256 1 "vector_operand" "vm"))
+ (vec_select:V4HF
+ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_operand:V4HF 4 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>_mask_1"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V4HF
+ (float_truncate:V4HF
+ (match_operand:VF4_128_8_256 1 "vector_operand" "vm"))
+ (match_operand:V4HF 3 "const0_operand" "C")
+ (match_operand:QI 2 "register_operand" "Yk"))
+ (match_operand:V4HF 4 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512fp16_vcvtpd2ph_v2df"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (float_truncate:V2HF
+ (match_operand:V2DF 1 "vector_operand" "vm"))
+ (match_dup 2)))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "operands[2] = CONST0_RTX (V6HFmode);")
+
+(define_insn "*avx512fp16_vcvtpd2ph_v2df"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (float_truncate:V2HF
+ (match_operand:V2DF 1 "vector_operand" "vm"))
+ (match_operand:V6HF 2 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvtpd2ph{x}\t{%1, %0|%0, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_expand "avx512fp16_vcvtpd2ph_v2df_mask"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V2HF
+ (float_truncate:V2HF
+ (match_operand:V2DF 1 "vector_operand" "vm"))
+ (vec_select:V2HF
+ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_dup 4)))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "operands[4] = CONST0_RTX (V6HFmode);")
+
+(define_insn "*avx512fp16_vcvtpd2ph_v2df_mask"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V2HF
+ (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand" "vm"))
+ (vec_select:V2HF
+ (match_operand:V8HF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_operand:V6HF 4 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvtpd2ph{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "*avx512fp16_vcvtpd2ph_v2df_mask_1"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_concat:V8HF
+ (vec_merge:V2HF
+ (float_truncate:V2HF
+ (match_operand:V2DF 1 "vector_operand" "vm"))
+ (match_operand:V2HF 3 "const0_operand" "C")
+ (match_operand:QI 2 "register_operand" "Yk"))
+ (match_operand:V6HF 4 "const0_operand" "C")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+ "vcvtpd2ph{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "avx512fp16_vcvtsh2<ssescalarmodesuffix><mask_scalar_name><round_saeonly_scalar_name>"
+ [(set (match_operand:VF48_128 0 "register_operand" "=v")
+ (vec_merge:VF48_128
+ (vec_duplicate:VF48_128
+ (float_extend:<ssescalarmode>
+ (vec_select:HF
+ (match_operand:V8HF 1 "register_operand" "v")
+ (parallel [(const_int 0)]))))
+ (match_operand:VF48_128 2 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vcvtsh2<ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1<round_saeonly_scalar_mask_op3>}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "avx512fp16_vcvtsh2<ssescalarmodesuffix><mask_scalar_name>_mem"
+ [(set (match_operand:VF48_128 0 "register_operand" "=v")
+ (vec_merge:VF48_128
+ (vec_duplicate:VF48_128
+ (float_extend:<ssescalarmode>
+ (match_operand:HF 1 "memory_operand" "m")))
+ (match_operand:VF48_128 2 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vcvtsh2<ssescalarmodesuffix>\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "avx512fp16_vcvt<ssescalarmodesuffix>2sh<mask_scalar_name><round_scalar_name>"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (vec_duplicate:V8HF
+ (float_truncate:HF
+ (vec_select:<ssescalarmode>
+ (match_operand:VF48_128 1 "register_operand" "v")
+ (parallel [(const_int 0)]))))
+ (match_operand:V8HF 2 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vcvt<ssescalarmodesuffix>2sh\t{<round_scalar_mask_op3>%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1<round_scalar_mask_op3>}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+(define_insn "avx512fp16_vcvt<ssescalarmodesuffix>2sh<mask_scalar_name>_mem"
+ [(set (match_operand:V8HF 0 "register_operand" "=v")
+ (vec_merge:V8HF
+ (vec_duplicate:V8HF
+ (float_truncate:HF
+ (match_operand:MODEF 1 "memory_operand" "m")))
+ (match_operand:V8HF 2 "register_operand" "v")
+ (const_int 1)))]
+ "TARGET_AVX512FP16"
+ "vcvt<ssescalarmodesuffix>2sh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1}"
+ [(set_attr "type" "ssecvt")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "TI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
;; Parallel single-precision floating point conversion operations
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -8759,11 +9392,11 @@
;; vmovw clears also the higer bits
(define_insn "vec_set<mode>_0"
- [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v,v")
- (vec_merge:VF_AVX512FP16
- (vec_duplicate:VF_AVX512FP16
- (match_operand:HF 2 "nonimmediate_operand" "r,m"))
- (match_operand:VF_AVX512FP16 1 "const0_operand" "C,C")
+ [(set (match_operand:VI2F 0 "register_operand" "=v,v")
+ (vec_merge:VI2F
+ (vec_duplicate:VI2F
+ (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,m"))
+ (match_operand:VI2F 1 "const0_operand" "C,C")
(const_int 1)))]
"TARGET_AVX512FP16"
"@
@@ -9031,7 +9664,8 @@
[(V16SF "avx512f") (V16SI "avx512f") (V8DF "avx512dq") (V8DI "avx512dq")])
(define_mode_attr extract_suf
- [(V16SF "32x4") (V16SI "32x4") (V8DF "64x2") (V8DI "64x2")])
+ [(V16SF "32x4") (V16SI "32x4") (V8DF "64x2") (V8DI "64x2")
+ (V8SF "32x4") (V8SI "32x4") (V4DF "64x2") (V4DI "64x2")])
(define_mode_iterator AVX512_VEC
[(V8DF "TARGET_AVX512DQ") (V8DI "TARGET_AVX512DQ") V16SF V16SI])
@@ -9891,16 +10525,33 @@
"operands[1] = gen_lowpart (HFmode, operands[1]);")
(define_insn "*vec_extracthf"
- [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=r,m")
+ [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=*r,m,x,v")
(vec_select:HF
- (match_operand:V8HF 1 "register_operand" "v,v")
+ (match_operand:V8HF 1 "register_operand" "v,v,0,v")
(parallel
[(match_operand:SI 2 "const_0_to_7_operand")])))]
"TARGET_SSE2"
- "@
- vpextrw\t{%2, %1, %k0|%k0, %1, %2}
- vpextrw\t{%2, %1, %0|%0, %1, %2}"
- [(set_attr "type" "sselog1")
+{
+ switch (which_alternative)
+ {
+ case 0:
+ return "vpextrw\t{%2, %1, %k0|%k0, %1, %2}";
+ case 1:
+ return "vpextrw\t{%2, %1, %0|%0, %1, %2}";
+
+ case 2:
+ operands[2] = GEN_INT (INTVAL (operands[2]) * 2);
+ return "psrldq\t{%2, %0|%0, %2}";
+ case 3:
+ operands[2] = GEN_INT (INTVAL (operands[2]) * 2);
+ return "vpsrldq\t{%2, %1, %0|%0, %1, %2}";
+
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set_attr "isa" "*,*,noavx,avx")
+ (set_attr "type" "sselog1,sselog1,sseishft1,sseishft1")
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "TI")])
@@ -10255,11 +10906,11 @@
})
(define_insn "avx512f_vmscalef<mode><mask_scalar_name><round_scalar_name>"
- [(set (match_operand:VF_128 0 "register_operand" "=v")
- (vec_merge:VF_128
- (unspec:VF_128
- [(match_operand:VF_128 1 "register_operand" "v")
- (match_operand:VF_128 2 "<round_scalar_nimm_predicate>" "<round_scalar_constraint>")]
+ [(set (match_operand:VFH_128 0 "register_operand" "=v")
+ (vec_merge:VFH_128
+ (unspec:VFH_128
+ [(match_operand:VFH_128 1 "register_operand" "v")
+ (match_operand:VFH_128 2 "<round_scalar_nimm_predicate>" "<round_scalar_constraint>")]
UNSPEC_SCALEF)
(match_dup 1)
(const_int 1)))]
@@ -10269,10 +10920,10 @@
(set_attr "mode" "<ssescalarmode>")])
(define_insn "<avx512>_scalef<mode><mask_name><round_name>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
- (unspec:VF_AVX512VL
- [(match_operand:VF_AVX512VL 1 "register_operand" "v")
- (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")]
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
+ (unspec:VFH_AVX512VL
+ [(match_operand:VFH_AVX512VL 1 "register_operand" "v")
+ (match_operand:VFH_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")]
UNSPEC_SCALEF))]
"TARGET_AVX512F"
"vscalef<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}"
@@ -10558,8 +11209,8 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<avx512>_getexp<mode><mask_name><round_saeonly_name>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
- (unspec:VF_AVX512VL [(match_operand:VF_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
+ (unspec:VFH_AVX512VL [(match_operand:VFH_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
UNSPEC_GETEXP))]
"TARGET_AVX512F"
"vgetexp<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}";
@@ -10567,11 +11218,11 @@
(set_attr "mode" "<MODE>")])
(define_insn "avx512f_sgetexp<mode><mask_scalar_name><round_saeonly_scalar_name>"
- [(set (match_operand:VF_128 0 "register_operand" "=v")
- (vec_merge:VF_128
- (unspec:VF_128
- [(match_operand:VF_128 1 "register_operand" "v")
- (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")]
+ [(set (match_operand:VFH_128 0 "register_operand" "=v")
+ (vec_merge:VFH_128
+ (unspec:VFH_128
+ [(match_operand:VFH_128 1 "register_operand" "v")
+ (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")]
UNSPEC_GETEXP)
(match_dup 1)
(const_int 1)))]
@@ -10603,9 +11254,21 @@
(match_operand:V48_256_512_AVX512VL 1 "register_operand" "v")
(parallel [(match_operand 2 "<vec_extract_imm_predicate>")])))]
"TARGET_AVX512F
- && INTVAL(operands[2]) >= 16 / GET_MODE_SIZE (<ssescalarmode>mode)"
- "valign<ternlogsuffix>\t{%2, %1, %1, %<xtg_mode>0|%<xtg_mode>0, %1, %1, %2}";
- [(set_attr "prefix" "evex")
+ && INTVAL(operands[2]) * GET_MODE_SIZE (<ssescalarmode>mode) >= 16"
+{
+ int byte_offset = INTVAL (operands[2]) * GET_MODE_SIZE (<ssescalarmode>mode);
+ if (byte_offset % 16 == 0)
+ {
+ operands[2] = GEN_INT (byte_offset / 16);
+ if (byte_offset / 16 == 1)
+ return "vextract<shuffletype><extract_suf>\t{%2, %t1, %x0|%x0, %t1, %2}";
+ else
+ return "vextract<shuffletype><extract_suf>\t{%2, %1, %x0|%x0, %1, %2}";
+ }
+ else
+ return "valign<ternlogsuffix>\t{%2, %1, %1, %<xtg_mode>0|%<xtg_mode>0, %1, %1, %2}";
+}
+ [(set_attr "prefix" "maybe_evex")
(set_attr "mode" "<sseintvecinsnmode>")])
(define_expand "avx512f_shufps512_mask"
@@ -10737,9 +11400,9 @@
(set_attr "mode" "<ssescalarmode>")])
(define_insn "<avx512>_rndscale<mode><mask_name><round_saeonly_name>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
- (unspec:VF_AVX512VL
- [(match_operand:VF_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>")
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
+ (unspec:VFH_AVX512VL
+ [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>")
(match_operand:SI 2 "const_0_to_255_operand")]
UNSPEC_ROUND))]
"TARGET_AVX512F"
@@ -10749,13 +11412,13 @@
(set_attr "mode" "<MODE>")])
(define_insn "avx512f_rndscale<mode><mask_scalar_name><round_saeonly_scalar_name>"
- [(set (match_operand:VF_128 0 "register_operand" "=v")
- (vec_merge:VF_128
- (unspec:VF_128
- [(match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")
+ [(set (match_operand:VFH_128 0 "register_operand" "=v")
+ (vec_merge:VFH_128
+ (unspec:VFH_128
+ [(match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")
(match_operand:SI 3 "const_0_to_255_operand")]
UNSPEC_ROUND)
- (match_operand:VF_128 1 "register_operand" "v")
+ (match_operand:VFH_128 1 "register_operand" "v")
(const_int 1)))]
"TARGET_AVX512F"
"vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}"
@@ -10764,14 +11427,14 @@
(set_attr "mode" "<MODE>")])
(define_insn "*avx512f_rndscale<mode><round_saeonly_name>"
- [(set (match_operand:VF_128 0 "register_operand" "=v")
- (vec_merge:VF_128
- (vec_duplicate:VF_128
+ [(set (match_operand:VFH_128 0 "register_operand" "=v")
+ (vec_merge:VFH_128
+ (vec_duplicate:VFH_128
(unspec:<ssescalarmode>
[(match_operand:<ssescalarmode> 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
(match_operand:SI 3 "const_0_to_255_operand")]
UNSPEC_ROUND))
- (match_operand:VF_128 1 "register_operand" "v")
+ (match_operand:VFH_128 1 "register_operand" "v")
(const_int 1)))]
"TARGET_AVX512F"
"vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}"
@@ -15359,12 +16022,12 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "TI")])
-(define_insn "avx512bw_interleave_highv32hi<mask_name>"
- [(set (match_operand:V32HI 0 "register_operand" "=v")
- (vec_select:V32HI
- (vec_concat:V64HI
- (match_operand:V32HI 1 "register_operand" "v")
- (match_operand:V32HI 2 "nonimmediate_operand" "vm"))
+(define_insn "avx512bw_interleave_high<mode><mask_name>"
+ [(set (match_operand:V32_512 0 "register_operand" "=v")
+ (vec_select:V32_512
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V32_512 1 "register_operand" "v")
+ (match_operand:V32_512 2 "nonimmediate_operand" "vm"))
(parallel [(const_int 4) (const_int 36)
(const_int 5) (const_int 37)
(const_int 6) (const_int 38)
@@ -15387,12 +16050,12 @@
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
-(define_insn "avx2_interleave_highv16hi<mask_name>"
- [(set (match_operand:V16HI 0 "register_operand" "=Yw")
- (vec_select:V16HI
- (vec_concat:V32HI
- (match_operand:V16HI 1 "register_operand" "Yw")
- (match_operand:V16HI 2 "nonimmediate_operand" "Ywm"))
+(define_insn "avx2_interleave_high<mode><mask_name>"
+ [(set (match_operand:V16_256 0 "register_operand" "=Yw")
+ (vec_select:V16_256
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V16_256 1 "register_operand" "Yw")
+ (match_operand:V16_256 2 "nonimmediate_operand" "Ywm"))
(parallel [(const_int 4) (const_int 20)
(const_int 5) (const_int 21)
(const_int 6) (const_int 22)
@@ -15407,12 +16070,12 @@
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "OI")])
-(define_insn "vec_interleave_highv8hi<mask_name>"
- [(set (match_operand:V8HI 0 "register_operand" "=x,Yw")
- (vec_select:V8HI
- (vec_concat:V16HI
- (match_operand:V8HI 1 "register_operand" "0,Yw")
- (match_operand:V8HI 2 "vector_operand" "xBm,Ywm"))
+(define_insn "vec_interleave_high<mode><mask_name>"
+ [(set (match_operand:V8_128 0 "register_operand" "=x,Yw")
+ (vec_select:V8_128
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V8_128 1 "register_operand" "0,Yw")
+ (match_operand:V8_128 2 "vector_operand" "xBm,Ywm"))
(parallel [(const_int 4) (const_int 12)
(const_int 5) (const_int 13)
(const_int 6) (const_int 14)
@@ -15427,12 +16090,12 @@
(set_attr "prefix" "orig,maybe_vex")
(set_attr "mode" "TI")])
-(define_insn "<mask_codefor>avx512bw_interleave_lowv32hi<mask_name>"
- [(set (match_operand:V32HI 0 "register_operand" "=v")
- (vec_select:V32HI
- (vec_concat:V64HI
- (match_operand:V32HI 1 "register_operand" "v")
- (match_operand:V32HI 2 "nonimmediate_operand" "vm"))
+(define_insn "<mask_codefor>avx512bw_interleave_low<mode><mask_name>"
+ [(set (match_operand:V32_512 0 "register_operand" "=v")
+ (vec_select:V32_512
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V32_512 1 "register_operand" "v")
+ (match_operand:V32_512 2 "nonimmediate_operand" "vm"))
(parallel [(const_int 0) (const_int 32)
(const_int 1) (const_int 33)
(const_int 2) (const_int 34)
@@ -15455,12 +16118,12 @@
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
-(define_insn "avx2_interleave_lowv16hi<mask_name>"
- [(set (match_operand:V16HI 0 "register_operand" "=Yw")
- (vec_select:V16HI
- (vec_concat:V32HI
- (match_operand:V16HI 1 "register_operand" "Yw")
- (match_operand:V16HI 2 "nonimmediate_operand" "Ywm"))
+(define_insn "avx2_interleave_low<mode><mask_name>"
+ [(set (match_operand:V16_256 0 "register_operand" "=Yw")
+ (vec_select:V16_256
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V16_256 1 "register_operand" "Yw")
+ (match_operand:V16_256 2 "nonimmediate_operand" "Ywm"))
(parallel [(const_int 0) (const_int 16)
(const_int 1) (const_int 17)
(const_int 2) (const_int 18)
@@ -15475,12 +16138,12 @@
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "OI")])
-(define_insn "vec_interleave_lowv8hi<mask_name>"
- [(set (match_operand:V8HI 0 "register_operand" "=x,Yw")
- (vec_select:V8HI
- (vec_concat:V16HI
- (match_operand:V8HI 1 "register_operand" "0,Yw")
- (match_operand:V8HI 2 "vector_operand" "xBm,Ywm"))
+(define_insn "vec_interleave_low<mode><mask_name>"
+ [(set (match_operand:V8_128 0 "register_operand" "=x,Yw")
+ (vec_select:V8_128
+ (vec_concat:<ssedoublevecmode>
+ (match_operand:V8_128 1 "register_operand" "0,Yw")
+ (match_operand:V8_128 2 "vector_operand" "xBm,Ywm"))
(parallel [(const_int 0) (const_int 8)
(const_int 1) (const_int 9)
(const_int 2) (const_int 10)
@@ -15655,6 +16318,7 @@
(V4SI "avx512dq") (V2DI "avx512dq")])
;; sse4_1_pinsrd must come before sse2_loadld since it is preferred.
+;; For V8HFmode and TARGET_AVX2, broadcastw + pblendw should be better.
(define_insn "<sse2p4_1>_pinsr<ssemodesuffix>"
[(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v")
(vec_merge:PINSR_MODE
@@ -15664,7 +16328,8 @@
(match_operand:SI 3 "const_int_operand")))]
"TARGET_SSE2
&& ((unsigned) exact_log2 (INTVAL (operands[3]))
- < GET_MODE_NUNITS (<MODE>mode))"
+ < GET_MODE_NUNITS (<MODE>mode))
+ && !(<MODE>mode == V8HFmode && TARGET_AVX2)"
{
operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
@@ -15672,26 +16337,18 @@
{
case 0:
if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
- return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}";
+ return "pinsr<sseintmodesuffix>\t{%3, %k2, %0|%0, %k2, %3}";
/* FALLTHRU */
case 1:
- return "pinsr<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}";
+ return "pinsr<sseintmodesuffix>\t{%3, %2, %0|%0, %2, %3}";
case 2:
case 4:
if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
- {
- if (<MODE>mode == V8HFmode)
- return "vpinsrw\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
- else
- return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
- }
+ return "vpinsr<sseintmodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
/* FALLTHRU */
case 3:
case 5:
- if (<MODE>mode == V8HFmode)
- return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}";
- else
- return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ return "vpinsr<sseintmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
default:
gcc_unreachable ();
}
@@ -19179,11 +19836,14 @@
(lt:VI1_AVX2 (match_dup 3) (match_dup 4))] UNSPEC_BLENDV))]
"operands[3] = gen_lowpart (<MODE>mode, operands[3]);")
-(define_insn "sse4_1_pblendw"
- [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x")
- (vec_merge:V8HI
- (match_operand:V8HI 2 "vector_operand" "YrBm,*xBm,xm")
- (match_operand:V8HI 1 "register_operand" "0,0,x")
+(define_mode_attr blendsuf
+ [(V8HI "w") (V8HF "ph")])
+
+(define_insn "sse4_1_pblend<blendsuf>"
+ [(set (match_operand:V8_128 0 "register_operand" "=Yr,*x,x")
+ (vec_merge:V8_128
+ (match_operand:V8_128 2 "vector_operand" "YrBm,*xBm,xm")
+ (match_operand:V8_128 1 "register_operand" "0,0,x")
(match_operand:SI 3 "const_0_to_255_operand" "n,n,n")))]
"TARGET_SSE4_1"
"@
@@ -19210,6 +19870,47 @@
operands[3] = GEN_INT (val << 8 | val);
})
+(define_expand "avx2_pblendph"
+ [(set (match_operand:V16HF 0 "register_operand")
+ (vec_merge:V16HF
+ (match_operand:V16HF 2 "register_operand")
+ (match_operand:V16HF 1 "register_operand")
+ (match_operand:SI 3 "const_int_operand")))]
+ "TARGET_AVX2
+ && !((INTVAL (operands[3]) & 0xff) && (INTVAL (operands[3]) & 0xff00))"
+{ /* The condition above rejects masks with bits set in both bits 0-7 and bits 8-15. */
+ int mask = INTVAL (operands[3]);
+ if (mask == 0)
+ emit_move_insn (operands[0], operands[1]); /* empty mask: just copy operands[1] */
+ else
+ {
+ rtx tmp = gen_reg_rtx (V16HImode); /* receives the word blend emitted below */
+ rtx blendw_idx, blendd_idx;
+
+ if (mask & 0xff) /* some of mask bits 0-7 are set */
+ {
+ blendw_idx = GEN_INT (mask & 0xff);
+ blendd_idx = GEN_INT (15); /* 0x0f: bits 0-3 of the 8-element dword blend mask */
+ }
+ else /* bits 0-7 are all zero: take the word mask from bits 8-15 */
+ {
+ blendw_idx = GEN_INT (mask >> 8 & 0xff);
+ blendd_idx = GEN_INT (240); /* 0xf0: bits 4-7 of the 8-element dword blend mask */
+ }
+ operands[1] = lowpart_subreg (V16HImode, operands[1], V16HFmode); /* V16HI view of the V16HF input */
+ operands[2] = lowpart_subreg (V16HImode, operands[2], V16HFmode); /* likewise for operands[2] */
+ emit_insn (gen_avx2_pblendw (tmp, operands[1], operands[2], blendw_idx)); /* step 1: word blend of both inputs into tmp */
+
+ operands[0] = lowpart_subreg (V8SImode, operands[0], V16HFmode); /* V8SI views for the dword blend */
+ tmp = lowpart_subreg (V8SImode, tmp, V16HImode);
+ operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);
+ emit_insn (gen_avx2_pblenddv8si (operands[0], operands[1], /* step 2: dword blend of operands[1] and tmp */
+ tmp, blendd_idx)); /* NOTE(review): presumably set bits pick tmp -- confirm against the pblendd pattern */
+ }
+
+ DONE;
+})
+
(define_insn "*avx2_pblendw"
[(set (match_operand:V16HI 0 "register_operand" "=x")
(vec_merge:V16HI
@@ -24714,10 +25415,10 @@
(define_insn "avx512dq_fpclass<mode><mask_scalar_merge_name>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
(unspec:<avx512fmaskmode>
- [(match_operand:VF_AVX512VL 1 "vector_operand" "vm")
+ [(match_operand:VFH_AVX512VL 1 "vector_operand" "vm")
(match_operand 2 "const_0_to_255_operand" "n")]
UNSPEC_FPCLASS))]
- "TARGET_AVX512DQ"
+ "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)"
"vfpclass<ssemodesuffix><vecmemsuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}";
[(set_attr "type" "sse")
(set_attr "length_immediate" "1")
@@ -24728,11 +25429,11 @@
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
(and:<avx512fmaskmode>
(unspec:<avx512fmaskmode>
- [(match_operand:VF_128 1 "nonimmediate_operand" "vm")
+ [(match_operand:VFH_128 1 "nonimmediate_operand" "vm")
(match_operand 2 "const_0_to_255_operand" "n")]
UNSPEC_FPCLASS)
(const_int 1)))]
- "TARGET_AVX512DQ"
+ "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)"
"vfpclass<ssescalarmodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}";
[(set_attr "type" "sse")
(set_attr "length_immediate" "1")
@@ -24740,9 +25441,9 @@
(set_attr "mode" "<MODE>")])
(define_insn "<avx512>_getmant<mode><mask_name><round_saeonly_name>"
- [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v")
- (unspec:VF_AVX512VL
- [(match_operand:VF_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>")
+ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
+ (unspec:VFH_AVX512VL
+ [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>")
(match_operand:SI 2 "const_0_to_15_operand")]
UNSPEC_GETMANT))]
"TARGET_AVX512F"
@@ -24751,11 +25452,11 @@
(set_attr "mode" "<MODE>")])
(define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>"
- [(set (match_operand:VF_128 0 "register_operand" "=v")
- (vec_merge:VF_128
- (unspec:VF_128
- [(match_operand:VF_128 1 "register_operand" "v")
- (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")
+ [(set (match_operand:VFH_128 0 "register_operand" "=v")
+ (vec_merge:VFH_128
+ (unspec:VFH_128
+ [(match_operand:VFH_128 1 "register_operand" "v")
+ (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")
(match_operand:SI 3 "const_0_to_15_operand")]
UNSPEC_GETMANT)
(match_dup 1)
diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md
index 717561a..157d49f 100644
--- a/gcc/config/i386/subst.md
+++ b/gcc/config/i386/subst.md
@@ -153,6 +153,7 @@
(define_subst_attr "round_mask_op4" "round" "" "<round_mask_operand4>")
(define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>")
(define_subst_attr "round_constraint" "round" "vm" "v")
+(define_subst_attr "round_qq2phsuff" "round" "<qq2phsuff>" "")
(define_subst_attr "bcst_round_constraint" "round" "vmBr" "v")
(define_subst_attr "round_constraint2" "round" "m" "v")
(define_subst_attr "round_constraint3" "round" "rm" "r")
diff --git a/gcc/config/i386/vxworks.h b/gcc/config/i386/vxworks.h
index ebda7d9..0676cb4 100644
--- a/gcc/config/i386/vxworks.h
+++ b/gcc/config/i386/vxworks.h
@@ -73,37 +73,37 @@ along with GCC; see the file COPYING3. If not see
VXWORKS_OS_CPP_BUILTINS (); \
if (TARGET_64BIT) \
VX_CPUDEF (X86_64); \
- else if (TARGET_PENTIUM4) \
+ else if (TARGET_CPU_P (PENTIUM4)) \
{ \
VX_CPUDEF (PENTIUM4); \
VX_CPUVDEF (PENTIUM4); \
} \
- else if (TARGET_CORE2) \
+ else if (TARGET_CPU_P (CORE2)) \
VX_CPUDEF (CORE2); \
- else if (TARGET_NEHALEM) \
+ else if (TARGET_CPU_P (NEHALEM)) \
VX_CPUDEF (NEHALEM); \
- else if (TARGET_SANDYBRIDGE) \
+ else if (TARGET_CPU_P (SANDYBRIDGE)) \
VX_CPUDEF (SANDYBRIDGE); \
- else if (TARGET_HASWELL) \
+ else if (TARGET_CPU_P (HASWELL)) \
VX_CPUDEF (HASWELL); \
- else if (TARGET_SILVERMONT) \
+ else if (TARGET_CPU_P (SILVERMONT)) \
VX_CPUDEF (SILVERMONT); \
- else if (TARGET_SKYLAKE || TARGET_SKYLAKE_AVX512) \
+ else if (TARGET_CPU_P (SKYLAKE) || TARGET_CPU_P (SKYLAKE_AVX512)) \
VX_CPUDEF (SKYLAKE); \
- else if (TARGET_GOLDMONT) \
+ else if (TARGET_CPU_P (GOLDMONT)) \
VX_CPUDEF (GOLDMONT); \
else if (TARGET_VXWORKS7) \
VX_CPUDEF (PENTIUM4); \
- else if (TARGET_386) \
+ else if (TARGET_CPU_P (I386)) \
VX_CPUDEF (I80386); \
- else if (TARGET_486) \
+ else if (TARGET_CPU_P (I486)) \
VX_CPUDEF (I80486); \
- else if (TARGET_PENTIUM) \
+ else if (TARGET_CPU_P (PENTIUM)) \
{ \
VX_CPUDEF (PENTIUM); \
VX_CPUVDEF (PENTIUM); \
} \
- else if (TARGET_PENTIUMPRO) \
+ else if (TARGET_CPU_P (PENTIUMPRO)) \
{ \
VX_CPUDEF (PENTIUM2); \
VX_CPUVDEF (PENTIUMPRO); \
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f..93644be 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {
"16", /* Func alignment. */
};
+static stringop_algs tremont_memcpy[2] = {
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}},
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}}};
+static stringop_algs tremont_memset[2] = {
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}},
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}}};
+static const
+struct processor_costs tremont_cost = { /* Cost table for Tremont. */
+ {
+ /* Start of register allocator costs. integer->integer move cost is 2. */
+ 6, /* cost for loading QImode using movzbl */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {6, 6, 6}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {6, 6, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 12}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {6, 6}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {6, 6}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
+ {6, 6, 6, 10, 15}, /* cost of loading SSE registers
+ in 32,64,128,256 and 512-bit */
+ {6, 6, 6, 10, 15}, /* cost of storing SSE registers
+ in 32,64,128,256 and 512-bit */
+ 6, 6, /* SSE->integer and integer->SSE moves */
+ 6, 6, /* mask->integer and integer->mask moves */
+ {6, 6, 6}, /* cost of loading mask register
+ in QImode, HImode, SImode. */
+ {6, 6, 6}, /* cost of storing mask register
+ in QImode, HImode, SImode. */
+ 2, /* cost of moving mask register. */
+ /* End of register allocator costs. */
+ },
+
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ /* Setting cost to 2 makes our current implementation of synth_mult result in
+ use of unnecessary temporary registers causing regression on several
+ SPECfp benchmarks. */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (4)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (22), /* HI */
+ COSTS_N_INSNS (30), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 17, /* CLEAR_RATIO */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {6, 6, 6}, /* cost of storing integer registers */
+ {6, 6, 6, 10, 15}, /* cost of loading SSE register
+ in 32bit, 64bit, 128bit, 256bit and 512bit */
+ {6, 6, 6, 10, 15}, /* cost of storing SSE register
+ in 32bit, 64bit, 128bit, 256bit and 512bit */
+ {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
+ {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
+ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
+ 6, /* cost of moving SSE register to integer. */
+ 18, 6, /* Gather load static, per_elt. */
+ 18, 6, /* Gather store static, per_elt. */
+ 32, /* size of l1 cache. */
+ 512, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ /* Benchmarks show large regressions on K8 sixtrack benchmark when this
+ value is increased to perhaps more appropriate value of 5. NOTE(review): this remark names K8; confirm it applies to this table. */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (5), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (17), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
+
+ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
+ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
+ COSTS_N_INSNS (4), /* cost of MULSS instruction. */
+ COSTS_N_INSNS (5), /* cost of MULSD instruction. */
+ COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
+ COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
+ COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
+ COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
+ COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
+ COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
+ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ tremont_memcpy,
+ tremont_memset,
+ COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
+ COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
+ "16:11:8", /* Loop alignment. */
+ "16:11:8", /* Jump alignment. */
+ "0:0:8", /* Label alignment. */
+ "16", /* Func alignment. */
+};
+
static stringop_algs intel_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c
index 2e5ee4e..56ada99 100644
--- a/gcc/config/i386/x86-tune-sched.c
+++ b/gcc/config/i386/x86-tune-sched.c
@@ -71,6 +71,7 @@ ix86_issue_rate (void)
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
case PROCESSOR_HASWELL:
+ case PROCESSOR_TREMONT:
case PROCESSOR_GENERIC:
return 4;
@@ -429,6 +430,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
case PROCESSOR_HASWELL:
+ case PROCESSOR_TREMONT:
case PROCESSOR_GENERIC:
/* Stack engine allows to execute push&pop instructions in parall. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 2f221b1..58e8ead 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -62,6 +62,21 @@ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
that can be partly masked by careful scheduling of moves. */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
+ | m_BDVER | m_ZNVER | m_TREMONT | m_GENERIC)
+
+/* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids
+ partial write to the destination in scalar SSE conversion from FP
+ to FP. */
+DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
+ "sse_partial_reg_fp_converts_dependency",
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
+ | m_BDVER | m_ZNVER | m_GENERIC)
+
+/* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial
+ write to the destination in scalar SSE conversion from integer to FP. */
+DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
+ "sse_partial_reg_converts_dependency",
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
| m_BDVER | m_ZNVER | m_GENERIC)
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
@@ -136,7 +151,7 @@ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
- | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ATHLON_K8)
+ | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8)
/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
considered on critical path. */
@@ -150,14 +165,15 @@ DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
- m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+ m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_TREMONT
+ | m_GENERIC)
/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
Some chips, like 486 and Pentium works faster with separate load
and push instructions. */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
- | m_GENERIC)
+ | m_TREMONT | m_GENERIC)
/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
over esp subtraction. */
@@ -198,8 +214,7 @@ DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
than 4 branch instructions in the 16 byte window. */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
- | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL | m_ATHLON_K8
- | m_AMDFAM10)
+ | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
/*****************************************************************************/
/* Integer instruction selection tuning */
@@ -240,11 +255,11 @@ DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
will impact LEA instruction selection. */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
- | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL)
+ | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)
/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
- m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
+ m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS
| m_KNL | m_KNM)
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
@@ -263,7 +278,7 @@ DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
a conditional move. */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL
- | m_KNM | m_TREMONT | m_INTEL)
+ | m_KNM | m_INTEL)
/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */
@@ -273,7 +288,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
move/set sequences of bytes with known size. */
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
"prefer_known_rep_movsb_stosb",
- m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
+ m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512)
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
compact prologues and epilogues by issuing a misaligned moves. This
@@ -282,7 +297,8 @@ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
FIXME: This may actualy be a win on more targets than listed here. */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
"misaligned_move_string_pro_epilogues",
- m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
+ m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_TREMONT
+ | m_GENERIC)
/* X86_TUNE_USE_SAHF: Controls use of SAHF. */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
@@ -294,7 +310,7 @@ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
/* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
- | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT))
+ | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))
/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
@@ -305,7 +321,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
for bit-manipulation instructions. */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
- m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
+ m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_GENERIC)
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
on hardware capabilities. Bdver3 hardware has a loop buffer which makes
@@ -321,14 +337,14 @@ DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
/* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence. */
DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
- m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC)
+ m_CORE_ALL | m_BDVER | m_ZNVER | m_TREMONT | m_GENERIC)
/* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) -
(signed) x >> (W-1)) instead of cmove or SSE max/abs instructions. */
DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
- | m_GOLDMONT_PLUS | m_TREMONT )
+ | m_GOLDMONT_PLUS)
/*****************************************************************************/
/* 387 instruction selection tuning */
@@ -386,13 +402,13 @@ DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optim
/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
- m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
+ m_AMD_MULTIPLE | m_CORE_ALL | m_TREMONT | m_GENERIC)
/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
xorps/xorpd and other variants. */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
- | m_GENERIC)
+ | m_TREMONT | m_GENERIC)
/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer
to SSE registers. If disabled, the moves will be done by storing
@@ -419,7 +435,7 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
fp converts to destination register. */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS
- | m_TREMONT | m_INTEL)
+ | m_INTEL)
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
from FP to FP. This form of instructions avoids partial write to the
@@ -434,7 +450,7 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
- | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL)
+ | m_GOLDMONT_PLUS | m_INTEL)
/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */
DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
diff --git a/gcc/config/mips/netbsd.h b/gcc/config/mips/netbsd.h
index 85c2779..1c6a59d 100644
--- a/gcc/config/mips/netbsd.h
+++ b/gcc/config/mips/netbsd.h
@@ -87,7 +87,7 @@ along with GCC; see the file COPYING3. If not see
else if (mips_isa >= MIPS_ISA_MIPS32 \
&& mips_isa < MIPS_ISA_MIPS64) \
builtin_define ("__mips=32"); \
- else if (mips_isa >= MIPS_ISA_64) \
+ else if (mips_isa >= MIPS_ISA_MIPS64) \
builtin_define ("__mips=64"); \
if (mips_isa_rev > 0) \
builtin_define_with_int_value ("__mips_isa_rev", \
diff --git a/gcc/config/rs6000/lynx.h b/gcc/config/rs6000/lynx.h
index 3434c8b..0ddb54f 100644
--- a/gcc/config/rs6000/lynx.h
+++ b/gcc/config/rs6000/lynx.h
@@ -80,7 +80,6 @@
#undef SIZE_TYPE
#undef ASM_OUTPUT_ALIGN
-#undef PREFERRED_DEBUGGING_TYPE
/* The file rs6000.c defines TARGET_HAVE_TLS unconditionally to the
value of HAVE_AS_TLS. HAVE_AS_TLS is true as gas support for TLS
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 1f6fc03..1990a21 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -91,7 +91,10 @@
UNSPEC_MMA_XVI8GER4SPP
UNSPEC_MMA_XXMFACC
UNSPEC_MMA_XXMTACC
- UNSPEC_MMA_XXSETACCZ
+ ])
+
+(define_c_enum "unspecv"
+ [UNSPECV_MMA_XXSETACCZ
])
;; MMA instructions with 1 accumulator argument
@@ -467,30 +470,16 @@
"<acc> %A0"
[(set_attr "type" "mma")])
-;; We can't have integer constants in XOmode so we wrap this in an UNSPEC.
-
-(define_expand "mma_xxsetaccz"
- [(set (match_operand:XO 0 "fpr_reg_operand")
- (const_int 0))]
- "TARGET_MMA"
-{
- rtx xo0 = gen_rtx_UNSPEC (XOmode, gen_rtvec (1, const0_rtx),
- UNSPEC_MMA_XXSETACCZ);
- emit_insn (gen_rtx_SET (operands[0], xo0));
- DONE;
-})
+;; We can't have integer constants in XOmode so we wrap this in an
+;; UNSPEC_VOLATILE.
-(define_insn_and_split "*mma_xxsetaccz"
+(define_insn "mma_xxsetaccz"
[(set (match_operand:XO 0 "fpr_reg_operand" "=d")
- (unspec:XO [(match_operand 1 "const_0_to_1_operand" "O")]
- UNSPEC_MMA_XXSETACCZ))]
+ (unspec_volatile:XO [(const_int 0)]
+ UNSPECV_MMA_XXSETACCZ))]
"TARGET_MMA"
"xxsetaccz %A0"
- "&& reload_completed"
- [(set (match_dup 0) (unspec:XO [(match_dup 1)] UNSPEC_MMA_XXSETACCZ))]
- ""
- [(set_attr "type" "mma")
- (set_attr "length" "4")])
+ [(set_attr "type" "mma")])
(define_insn "mma_<vv>"
[(set (match_operand:XO 0 "fpr_reg_operand" "=&d")
diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def
index 6a28d51..a8c6b9e 100644
--- a/gcc/config/rs6000/rs6000-builtin-new.def
+++ b/gcc/config/rs6000/rs6000-builtin-new.def
@@ -208,6 +208,12 @@
double __builtin_mffs ();
MFFS rs6000_mffs {}
+; Although the mffsl instruction is only available on POWER9 and later
+; processors, this builtin automatically falls back to mffs on older
+; platforms. Thus it appears here in the [always] stanza.
+ double __builtin_mffsl ();
+ MFFSL rs6000_mffsl {}
+
; This thing really assumes long double == __ibm128, and I'm told it has
; been used as such within libgcc. Given that __builtin_pack_ibm128
; exists for the same purpose, this should really not be used at all.
@@ -2784,9 +2790,6 @@
signed long long __builtin_darn_raw ();
DARN_RAW darn_raw {}
- double __builtin_mffsl ();
- MFFSL rs6000_mffsl {}
-
const signed int __builtin_dtstsfi_eq_dd (const int<6>, _Decimal64);
TSTSFI_EQ_DD dfptstsfi_eq_dd {}
diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index afcb5bb..d08bdfe 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -35,6 +35,9 @@
#include "langhooks.h"
#include "c/c-tree.h"
+#include "rs6000-builtins.h"
+
+static tree altivec_resolve_new_overloaded_builtin (location_t, tree, void *);
/* Handle the machine specific pragma longcall. Its syntax is
@@ -811,6 +814,32 @@ is_float128_p (tree t)
&& t == long_double_type_node));
}
+
+/* Return true iff ARGTYPE can be compatibly passed as PARMTYPE.
+ An error_mark_node PARMTYPE is never compatible. Any two integral
+ types are compatible, and so are any two types accepted by
+ is_float128_p when both TARGET_IEEEQUAD and TARGET_LONG_DOUBLE_128
+ hold. For two pointer types the pointed-to types are compared
+ instead; if ARGTYPE's pointee is read-only, PARMTYPE's pointee is
+ first replaced by its const-qualified variant. Every remaining case
+ is decided by lang_hooks.types_compatible_p. */
+static bool
+rs6000_new_builtin_type_compatible (tree parmtype, tree argtype)
+{
+ if (parmtype == error_mark_node)
+ return false;
+
+ if (INTEGRAL_TYPE_P (parmtype) && INTEGRAL_TYPE_P (argtype))
+ return true;
+
+ if (TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128
+ && is_float128_p (parmtype) && is_float128_p (argtype))
+ return true;
+
+ if (POINTER_TYPE_P (parmtype) && POINTER_TYPE_P (argtype))
+ {
+ parmtype = TREE_TYPE (parmtype);
+ argtype = TREE_TYPE (argtype);
+ if (TYPE_READONLY (argtype))
+ parmtype = build_qualified_type (parmtype, TYPE_QUAL_CONST);
+ }
+
+ return lang_hooks.types_compatible_p (parmtype, argtype);
+}
+
static inline bool
rs6000_builtin_type_compatible (tree t, int id)
{
@@ -927,6 +956,10 @@ tree
altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
void *passed_arglist)
{
+ if (new_builtins_are_live)
+ return altivec_resolve_new_overloaded_builtin (loc, fndecl,
+ passed_arglist);
+
vec<tree, va_gc> *arglist = static_cast<vec<tree, va_gc> *> (passed_arglist);
unsigned int nargs = vec_safe_length (arglist);
enum rs6000_builtins fcode
@@ -1930,3 +1963,1048 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
return error_mark_node;
}
}
+
+/* Build a tree for a function call to an Altivec non-overloaded builtin.
+ BIF_ID selects the builtin to call (its decl is taken from
+ rs6000_builtin_decls_x) and OVLD_ID is the overloaded builtin being
+ resolved. The N arguments are given in ARGS and are converted in
+ place to the first N parameter types listed in FNTYPE. The call is
+ returned converted to RET_TYPE.
+
+ Actually the only thing it does is calling fully_fold_convert on ARGS,
+ with a small exception for vec_{all,any}_{ge,le} predicates. */
+
+static tree
+altivec_build_new_resolved_builtin (tree *args, int n, tree fntype,
+ tree ret_type,
+ rs6000_gen_builtins bif_id,
+ rs6000_gen_builtins ovld_id)
+{
+ tree argtypes = TYPE_ARG_TYPES (fntype);
+ tree arg_type[MAX_OVLD_ARGS];
+ tree fndecl = rs6000_builtin_decls_x[bif_id];
+
+ for (int i = 0; i < n; i++)
+ {
+ arg_type[i] = TREE_VALUE (argtypes);
+ argtypes = TREE_CHAIN (argtypes);
+ }
+
+ /* The AltiVec overloading implementation is overall gross, but this
+ is particularly disgusting. The vec_{all,any}_{ge,le} builtins
+ are completely different for floating-point vs. integer vector
+ types, because the former has vcmpgefp, but the latter should use
+ vcmpgtXX.
+
+ In practice, the second and third arguments are swapped, and the
+ condition (LT vs. EQ, which is recognizable by bit 1 of the first
+ argument) is reversed. Patch the arguments here before building
+ the resolved CALL_EXPR. */
+ if (n == 3
+ && ovld_id == RS6000_OVLD_VEC_CMPGE_P
+ && bif_id != RS6000_BIF_VCMPGEFP_P
+ && bif_id != RS6000_BIF_XVCMPGEDP_P)
+ {
+ std::swap (args[1], args[2]);
+ std::swap (arg_type[1], arg_type[2]);
+
+ args[0] = fold_build2 (BIT_XOR_EXPR, TREE_TYPE (args[0]), args[0],
+ build_int_cst (NULL_TREE, 2));
+ }
+
+ for (int j = 0; j < n; j++)
+ args[j] = fully_fold_convert (arg_type[j], args[j]);
+
+ /* If the number of arguments to an overloaded function increases,
+ we must expand this switch. */
+ gcc_assert (MAX_OVLD_ARGS <= 4);
+
+ tree call;
+ switch (n)
+ {
+ case 0:
+ call = build_call_expr (fndecl, 0);
+ break;
+ case 1:
+ call = build_call_expr (fndecl, 1, args[0]);
+ break;
+ case 2:
+ call = build_call_expr (fndecl, 2, args[0], args[1]);
+ break;
+ case 3:
+ call = build_call_expr (fndecl, 3, args[0], args[1], args[2]);
+ break;
+ case 4:
+ call = build_call_expr (fndecl, 4, args[0], args[1], args[2], args[3]);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ return fold_convert (ret_type, call);
+}
+
+/* Implementation of the resolve_overloaded_builtin target hook, to
+ support Altivec's overloaded builtins. FIXME: This code needs
+ to be brutally factored. */
+
+static tree
+altivec_resolve_new_overloaded_builtin (location_t loc, tree fndecl,
+ void *passed_arglist)
+{
+ vec<tree, va_gc> *arglist = static_cast<vec<tree, va_gc> *> (passed_arglist);
+ unsigned int nargs = vec_safe_length (arglist);
+ enum rs6000_gen_builtins fcode
+ = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl);
+ tree fnargs = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
+ tree types[MAX_OVLD_ARGS];
+ tree args[MAX_OVLD_ARGS];
+
+ /* Return immediately if this isn't an overload. */
+ if (fcode <= RS6000_OVLD_NONE)
+ return NULL_TREE;
+
+ unsigned int adj_fcode = fcode - RS6000_OVLD_NONE;
+
+ if (TARGET_DEBUG_BUILTIN)
+ fprintf (stderr, "altivec_resolve_overloaded_builtin, code = %4d, %s\n",
+ (int) fcode, IDENTIFIER_POINTER (DECL_NAME (fndecl)));
+
+ /* vec_lvsl and vec_lvsr are deprecated for use with LE element order. */
+ if (fcode == RS6000_OVLD_VEC_LVSL && !BYTES_BIG_ENDIAN)
+ warning (OPT_Wdeprecated,
+ "%<vec_lvsl%> is deprecated for little endian; use "
+ "assignment for unaligned loads and stores");
+ else if (fcode == RS6000_OVLD_VEC_LVSR && !BYTES_BIG_ENDIAN)
+ warning (OPT_Wdeprecated,
+ "%<vec_lvsr%> is deprecated for little endian; use "
+ "assignment for unaligned loads and stores");
+
+ if (fcode == RS6000_OVLD_VEC_MUL)
+ {
+ /* vec_mul needs to be special cased because there are no instructions
+ for it for the {un}signed char, {un}signed short, and {un}signed int
+ types. */
+ if (nargs != 2)
+ {
+ error ("builtin %qs only accepts 2 arguments", "vec_mul");
+ return error_mark_node;
+ }
+
+ tree arg0 = (*arglist)[0];
+ tree arg0_type = TREE_TYPE (arg0);
+ tree arg1 = (*arglist)[1];
+ tree arg1_type = TREE_TYPE (arg1);
+
+ /* Both arguments must be vectors and the types must be compatible. */
+ if (TREE_CODE (arg0_type) != VECTOR_TYPE)
+ goto bad;
+ if (!lang_hooks.types_compatible_p (arg0_type, arg1_type))
+ goto bad;
+
+ switch (TYPE_MODE (TREE_TYPE (arg0_type)))
+ {
+ case E_QImode:
+ case E_HImode:
+ case E_SImode:
+ case E_DImode:
+ case E_TImode:
+ {
+ /* For scalar types just use a multiply expression. */
+ return fold_build2_loc (loc, MULT_EXPR, TREE_TYPE (arg0), arg0,
+ fold_convert (TREE_TYPE (arg0), arg1));
+ }
+ case E_SFmode:
+ {
+ /* For floats use the xvmulsp instruction directly. */
+ tree call = rs6000_builtin_decls_x[RS6000_BIF_XVMULSP];
+ return build_call_expr (call, 2, arg0, arg1);
+ }
+ case E_DFmode:
+ {
+ /* For doubles use the xvmuldp instruction directly. */
+ tree call = rs6000_builtin_decls_x[RS6000_BIF_XVMULDP];
+ return build_call_expr (call, 2, arg0, arg1);
+ }
+ /* Other types are errors. */
+ default:
+ goto bad;
+ }
+ }
+
+ if (fcode == RS6000_OVLD_VEC_CMPNE)
+ {
+ /* vec_cmpne needs to be special cased because there are no instructions
+ for it (prior to power 9). */
+ if (nargs != 2)
+ {
+ error ("builtin %qs only accepts 2 arguments", "vec_cmpne");
+ return error_mark_node;
+ }
+
+ tree arg0 = (*arglist)[0];
+ tree arg0_type = TREE_TYPE (arg0);
+ tree arg1 = (*arglist)[1];
+ tree arg1_type = TREE_TYPE (arg1);
+
+ /* Both arguments must be vectors and the types must be compatible. */
+ if (TREE_CODE (arg0_type) != VECTOR_TYPE)
+ goto bad;
+ if (!lang_hooks.types_compatible_p (arg0_type, arg1_type))
+ goto bad;
+
+ /* Power9 instructions provide the most efficient implementation of
+ ALTIVEC_BUILTIN_VEC_CMPNE if the mode is not DImode or TImode
+ or SFmode or DFmode. */
+ if (!TARGET_P9_VECTOR
+ || (TYPE_MODE (TREE_TYPE (arg0_type)) == DImode)
+ || (TYPE_MODE (TREE_TYPE (arg0_type)) == TImode)
+ || (TYPE_MODE (TREE_TYPE (arg0_type)) == SFmode)
+ || (TYPE_MODE (TREE_TYPE (arg0_type)) == DFmode))
+ {
+ switch (TYPE_MODE (TREE_TYPE (arg0_type)))
+ {
+ /* vec_cmpneq (va, vb) == vec_nor (vec_cmpeq (va, vb),
+ vec_cmpeq (va, vb)). */
+ /* Note: vec_nand also works but opt changes vec_nand's
+ to vec_nor's anyway. */
+ case E_QImode:
+ case E_HImode:
+ case E_SImode:
+ case E_DImode:
+ case E_TImode:
+ case E_SFmode:
+ case E_DFmode:
+ {
+ /* call = vec_cmpeq (va, vb)
+ result = vec_nor (call, call). */
+ vec<tree, va_gc> *params = make_tree_vector ();
+ vec_safe_push (params, arg0);
+ vec_safe_push (params, arg1);
+ tree call = altivec_resolve_new_overloaded_builtin
+ (loc, rs6000_builtin_decls_x[RS6000_OVLD_VEC_CMPEQ],
+ params);
+ /* Use save_expr to ensure that operands used more than once
+ that may have side effects (like calls) are only evaluated
+ once. */
+ call = save_expr (call);
+ params = make_tree_vector ();
+ vec_safe_push (params, call);
+ vec_safe_push (params, call);
+ return altivec_resolve_new_overloaded_builtin
+ (loc, rs6000_builtin_decls_x[RS6000_OVLD_VEC_NOR], params);
+ }
+ /* Other types are errors. */
+ default:
+ goto bad;
+ }
+ }
+ /* else, fall through and process the Power9 alternative below */
+ }
+
+ if (fcode == RS6000_OVLD_VEC_ADDE || fcode == RS6000_OVLD_VEC_SUBE)
+ {
+ /* vec_adde needs to be special cased because there is no instruction
+ for the {un}signed int version. */
+ if (nargs != 3)
+ {
+ const char *name;
+ name = fcode == RS6000_OVLD_VEC_ADDE ? "vec_adde" : "vec_sube";
+ error ("builtin %qs only accepts 3 arguments", name);
+ return error_mark_node;
+ }
+
+ tree arg0 = (*arglist)[0];
+ tree arg0_type = TREE_TYPE (arg0);
+ tree arg1 = (*arglist)[1];
+ tree arg1_type = TREE_TYPE (arg1);
+ tree arg2 = (*arglist)[2];
+ tree arg2_type = TREE_TYPE (arg2);
+
+ /* All 3 arguments must be vectors of (signed or unsigned) (int or
+ __int128) and the types must be compatible. */
+ if (TREE_CODE (arg0_type) != VECTOR_TYPE)
+ goto bad;
+ if (!lang_hooks.types_compatible_p (arg0_type, arg1_type)
+ || !lang_hooks.types_compatible_p (arg1_type, arg2_type))
+ goto bad;
+
+ switch (TYPE_MODE (TREE_TYPE (arg0_type)))
+ {
+ /* For {un}signed ints,
+ vec_adde (va, vb, carryv) == vec_add (vec_add (va, vb),
+ vec_and (carryv, 1)).
+ vec_sube (va, vb, carryv) == vec_sub (vec_sub (va, vb),
+ vec_and (carryv, 1)). */
+ case E_SImode:
+ {
+ tree add_sub_builtin;
+
+ vec<tree, va_gc> *params = make_tree_vector ();
+ vec_safe_push (params, arg0);
+ vec_safe_push (params, arg1);
+
+ if (fcode == RS6000_OVLD_VEC_ADDE)
+ add_sub_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADD];
+ else
+ add_sub_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUB];
+
+ tree call
+ = altivec_resolve_new_overloaded_builtin (loc,
+ add_sub_builtin,
+ params);
+ tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1);
+ tree ones_vector = build_vector_from_val (arg0_type, const1);
+ tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type,
+ arg2, ones_vector);
+ params = make_tree_vector ();
+ vec_safe_push (params, call);
+ vec_safe_push (params, and_expr);
+ return altivec_resolve_new_overloaded_builtin (loc,
+ add_sub_builtin,
+ params);
+ }
+ /* For {un}signed __int128s use the vaddeuqm/vsubeuqm instruction
+ directly. */
+ case E_TImode:
+ break;
+
+ /* Types other than {un}signed int and {un}signed __int128
+ are errors. */
+ default:
+ goto bad;
+ }
+ }
+
+ if (fcode == RS6000_OVLD_VEC_ADDEC || fcode == RS6000_OVLD_VEC_SUBEC)
+ {
+ /* vec_addec and vec_subec need to be special cased because there is
+ no instruction for the {un}signed int version. */
+ if (nargs != 3)
+ {
+ const char *name;
+ name = fcode == RS6000_OVLD_VEC_ADDEC ? "vec_addec" : "vec_subec";
+ error ("builtin %qs only accepts 3 arguments", name);
+ return error_mark_node;
+ }
+
+ tree arg0 = (*arglist)[0];
+ tree arg0_type = TREE_TYPE (arg0);
+ tree arg1 = (*arglist)[1];
+ tree arg1_type = TREE_TYPE (arg1);
+ tree arg2 = (*arglist)[2];
+ tree arg2_type = TREE_TYPE (arg2);
+
+ /* All 3 arguments must be vectors of (signed or unsigned) (int or
+ __int128) and the types must be compatible. */
+ if (TREE_CODE (arg0_type) != VECTOR_TYPE)
+ goto bad;
+ if (!lang_hooks.types_compatible_p (arg0_type, arg1_type)
+ || !lang_hooks.types_compatible_p (arg1_type, arg2_type))
+ goto bad;
+
+ switch (TYPE_MODE (TREE_TYPE (arg0_type)))
+ {
+ /* For {un}signed ints,
+ vec_addec (va, vb, carryv) ==
+ vec_or (vec_addc (va, vb),
+ vec_addc (vec_add (va, vb),
+ vec_and (carryv, 0x1))). */
+ case E_SImode:
+ {
+ /* Use save_expr to ensure that operands used more than once
+ that may have side effects (like calls) are only evaluated
+ once. */
+ tree as_builtin;
+ tree as_c_builtin;
+
+ arg0 = save_expr (arg0);
+ arg1 = save_expr (arg1);
+ vec<tree, va_gc> *params = make_tree_vector ();
+ vec_safe_push (params, arg0);
+ vec_safe_push (params, arg1);
+
+ if (fcode == RS6000_OVLD_VEC_ADDEC)
+ as_c_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADDC];
+ else
+ as_c_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUBC];
+
+ tree call1 = altivec_resolve_new_overloaded_builtin (loc,
+ as_c_builtin,
+ params);
+ params = make_tree_vector ();
+ vec_safe_push (params, arg0);
+ vec_safe_push (params, arg1);
+
+ if (fcode == RS6000_OVLD_VEC_ADDEC)
+ as_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADD];
+ else
+ as_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUB];
+
+ tree call2 = altivec_resolve_new_overloaded_builtin (loc,
+ as_builtin,
+ params);
+ tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1);
+ tree ones_vector = build_vector_from_val (arg0_type, const1);
+ tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type,
+ arg2, ones_vector);
+ params = make_tree_vector ();
+ vec_safe_push (params, call2);
+ vec_safe_push (params, and_expr);
+ call2 = altivec_resolve_new_overloaded_builtin (loc, as_c_builtin,
+ params);
+ params = make_tree_vector ();
+ vec_safe_push (params, call1);
+ vec_safe_push (params, call2);
+ tree or_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_OR];
+ return altivec_resolve_new_overloaded_builtin (loc, or_builtin,
+ params);
+ }
+ /* For {un}signed __int128s use the vaddecuq/vsubecuq
+ instructions. This occurs through normal processing. */
+ case E_TImode:
+ break;
+
+ /* Types other than {un}signed int and {un}signed __int128
+ are errors. */
+ default:
+ goto bad;
+ }
+ }
+
+ /* For now treat vec_splats and vec_promote as the same. */
+ if (fcode == RS6000_OVLD_VEC_SPLATS || fcode == RS6000_OVLD_VEC_PROMOTE)
+ {
+ tree type, arg;
+ int size;
+ int i;
+ bool unsigned_p;
+ vec<constructor_elt, va_gc> *vec;
+ const char *name;
+ name = fcode == RS6000_OVLD_VEC_SPLATS ? "vec_splats" : "vec_promote";
+
+ if (fcode == RS6000_OVLD_VEC_SPLATS && nargs != 1)
+ {
+ error ("builtin %qs only accepts 1 argument", name);
+ return error_mark_node;
+ }
+ if (fcode == RS6000_OVLD_VEC_PROMOTE && nargs != 2)
+ {
+ error ("builtin %qs only accepts 2 arguments", name);
+ return error_mark_node;
+ }
+ /* Ignore promote's element argument. */
+ if (fcode == RS6000_OVLD_VEC_PROMOTE
+ && !INTEGRAL_TYPE_P (TREE_TYPE ((*arglist)[1])))
+ goto bad;
+
+ arg = (*arglist)[0];
+ type = TREE_TYPE (arg);
+ if (!SCALAR_FLOAT_TYPE_P (type)
+ && !INTEGRAL_TYPE_P (type))
+ goto bad;
+ unsigned_p = TYPE_UNSIGNED (type);
+ switch (TYPE_MODE (type))
+ {
+ case E_TImode:
+ type = unsigned_p ? unsigned_V1TI_type_node : V1TI_type_node;
+ size = 1;
+ break;
+ case E_DImode:
+ type = unsigned_p ? unsigned_V2DI_type_node : V2DI_type_node;
+ size = 2;
+ break;
+ case E_SImode:
+ type = unsigned_p ? unsigned_V4SI_type_node : V4SI_type_node;
+ size = 4;
+ break;
+ case E_HImode:
+ type = unsigned_p ? unsigned_V8HI_type_node : V8HI_type_node;
+ size = 8;
+ break;
+ case E_QImode:
+ type = unsigned_p ? unsigned_V16QI_type_node : V16QI_type_node;
+ size = 16;
+ break;
+ case E_SFmode:
+ type = V4SF_type_node;
+ size = 4;
+ break;
+ case E_DFmode:
+ type = V2DF_type_node;
+ size = 2;
+ break;
+ default:
+ goto bad;
+ }
+ arg = save_expr (fold_convert (TREE_TYPE (type), arg));
+ vec_alloc (vec, size);
+ for (i = 0; i < size; i++)
+ {
+ constructor_elt elt = {NULL_TREE, arg};
+ vec->quick_push (elt);
+ }
+ return build_constructor (type, vec);
+ }
+
+ /* For now use pointer tricks to do the extraction, unless we are on VSX
+ extracting a double from a constant offset. */
+ if (fcode == RS6000_OVLD_VEC_EXTRACT)
+ {
+ tree arg1;
+ tree arg1_type;
+ tree arg2;
+ tree arg1_inner_type;
+ tree decl, stmt;
+ tree innerptrtype;
+ machine_mode mode;
+
+ /* No second argument. */
+ if (nargs != 2)
+ {
+ error ("builtin %qs only accepts 2 arguments", "vec_extract");
+ return error_mark_node;
+ }
+
+ arg2 = (*arglist)[1];
+ arg1 = (*arglist)[0];
+ arg1_type = TREE_TYPE (arg1);
+
+ if (TREE_CODE (arg1_type) != VECTOR_TYPE)
+ goto bad;
+ if (!INTEGRAL_TYPE_P (TREE_TYPE (arg2)))
+ goto bad;
+
+ /* See if we can optimize vec_extracts with the current VSX instruction
+ set. */
+ mode = TYPE_MODE (arg1_type);
+ if (VECTOR_MEM_VSX_P (mode))
+
+ {
+ tree call = NULL_TREE;
+ int nunits = GET_MODE_NUNITS (mode);
+
+ arg2 = fold_for_warn (arg2);
+
+ /* If the second argument is an integer constant, generate
+ the built-in code if we can. We need 64-bit and direct
+ move to extract the small integer vectors. */
+ if (TREE_CODE (arg2) == INTEGER_CST)
+ {
+ wide_int selector = wi::to_wide (arg2);
+ selector = wi::umod_trunc (selector, nunits);
+ arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector);
+ switch (mode)
+ {
+ default:
+ break;
+
+ case E_V1TImode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V1TI];
+ break;
+
+ case E_V2DFmode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DF];
+ break;
+
+ case E_V2DImode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DI];
+ break;
+
+ case E_V4SFmode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SF];
+ break;
+
+ case E_V4SImode:
+ if (TARGET_DIRECT_MOVE_64BIT)
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SI];
+ break;
+
+ case E_V8HImode:
+ if (TARGET_DIRECT_MOVE_64BIT)
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V8HI];
+ break;
+
+ case E_V16QImode:
+ if (TARGET_DIRECT_MOVE_64BIT)
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V16QI];
+ break;
+ }
+ }
+
+ /* If the second argument is variable, we can optimize it if we are
+ generating 64-bit code on a machine with direct move. */
+ else if (TREE_CODE (arg2) != INTEGER_CST && TARGET_DIRECT_MOVE_64BIT)
+ {
+ switch (mode)
+ {
+ default:
+ break;
+
+ case E_V2DFmode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DF];
+ break;
+
+ case E_V2DImode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DI];
+ break;
+
+ case E_V4SFmode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SF];
+ break;
+
+ case E_V4SImode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SI];
+ break;
+
+ case E_V8HImode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V8HI];
+ break;
+
+ case E_V16QImode:
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V16QI];
+ break;
+ }
+ }
+
+ if (call)
+ {
+ tree result = build_call_expr (call, 2, arg1, arg2);
+ /* Coerce the result to vector element type. May be no-op. */
+ arg1_inner_type = TREE_TYPE (arg1_type);
+ result = fold_convert (arg1_inner_type, result);
+ return result;
+ }
+ }
+
+ /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2). */
+ arg1_inner_type = TREE_TYPE (arg1_type);
+ tree subp = build_int_cst (TREE_TYPE (arg2),
+ TYPE_VECTOR_SUBPARTS (arg1_type) - 1);
+ arg2 = build_binary_op (loc, BIT_AND_EXPR, arg2, subp, 0);
+ decl = build_decl (loc, VAR_DECL, NULL_TREE, arg1_type);
+ DECL_EXTERNAL (decl) = 0;
+ TREE_PUBLIC (decl) = 0;
+ DECL_CONTEXT (decl) = current_function_decl;
+ TREE_USED (decl) = 1;
+ TREE_TYPE (decl) = arg1_type;
+ TREE_READONLY (decl) = TYPE_READONLY (arg1_type);
+ if (c_dialect_cxx ())
+ {
+ stmt = build4 (TARGET_EXPR, arg1_type, decl, arg1,
+ NULL_TREE, NULL_TREE);
+ SET_EXPR_LOCATION (stmt, loc);
+ }
+ else
+ {
+ DECL_INITIAL (decl) = arg1;
+ stmt = build1 (DECL_EXPR, arg1_type, decl);
+ TREE_ADDRESSABLE (decl) = 1;
+ SET_EXPR_LOCATION (stmt, loc);
+ stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt);
+ }
+
+ innerptrtype = build_pointer_type (arg1_inner_type);
+
+ stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0);
+ stmt = convert (innerptrtype, stmt);
+ stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
+ stmt = build_indirect_ref (loc, stmt, RO_NULL);
+
+ /* PR83660: We mark this as having side effects so that
+ downstream in fold_build_cleanup_point_expr () it will get a
+ CLEANUP_POINT_EXPR. If it does not we can run into an ICE
+ later in gimplify_cleanup_point_expr (). Potentially this
+ causes missed optimization because there actually is no side
+ effect. */
+ if (c_dialect_cxx ())
+ TREE_SIDE_EFFECTS (stmt) = 1;
+
+ return stmt;
+ }
+
+ /* For now use pointer tricks to do the insertion, unless we are on VSX
+ inserting a double to a constant offset. */
+ if (fcode == RS6000_OVLD_VEC_INSERT)
+ {
+ tree arg0;
+ tree arg1;
+ tree arg2;
+ tree arg1_type;
+ tree decl, stmt;
+ machine_mode mode;
+
+ /* No second or third arguments. */
+ if (nargs != 3)
+ {
+ error ("builtin %qs only accepts 3 arguments", "vec_insert");
+ return error_mark_node;
+ }
+
+ arg0 = (*arglist)[0];
+ arg1 = (*arglist)[1];
+ arg1_type = TREE_TYPE (arg1);
+ arg2 = fold_for_warn ((*arglist)[2]);
+
+ if (TREE_CODE (arg1_type) != VECTOR_TYPE)
+ goto bad;
+ if (!INTEGRAL_TYPE_P (TREE_TYPE (arg2)))
+ goto bad;
+
+ /* If we can use the VSX xxpermdi instruction, use that for insert. */
+ mode = TYPE_MODE (arg1_type);
+ if ((mode == V2DFmode || mode == V2DImode) && VECTOR_UNIT_VSX_P (mode)
+ && TREE_CODE (arg2) == INTEGER_CST)
+ {
+ wide_int selector = wi::to_wide (arg2);
+ selector = wi::umod_trunc (selector, 2);
+ tree call = NULL_TREE;
+
+ arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector);
+ if (mode == V2DFmode)
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V2DF];
+ else if (mode == V2DImode)
+ call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V2DI];
+
+ /* Note, __builtin_vec_insert_<xxx> has vector and scalar types
+ reversed. */
+ if (call)
+ return build_call_expr (call, 3, arg1, arg0, arg2);
+ }
+ else if (mode == V1TImode && VECTOR_UNIT_VSX_P (mode)
+ && TREE_CODE (arg2) == INTEGER_CST)
+ {
+ tree call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V1TI];
+ wide_int selector = wi::zero(32);
+
+ arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector);
+ /* Note, __builtin_vec_insert_<xxx> has vector and scalar types
+ reversed. */
+ return build_call_expr (call, 3, arg1, arg0, arg2);
+ }
+
+ /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with
+ VIEW_CONVERT_EXPR. i.e.:
+ D.3192 = v1;
+ _1 = n & 3;
+ VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i;
+ v1 = D.3192;
+ D.3194 = v1; */
+ if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1)
+ arg2 = build_int_cst (TREE_TYPE (arg2), 0);
+ else
+ arg2 = build_binary_op (loc, BIT_AND_EXPR, arg2,
+ build_int_cst (TREE_TYPE (arg2),
+ TYPE_VECTOR_SUBPARTS (arg1_type)
+ - 1), 0);
+ decl = build_decl (loc, VAR_DECL, NULL_TREE, arg1_type);
+ DECL_EXTERNAL (decl) = 0;
+ TREE_PUBLIC (decl) = 0;
+ DECL_CONTEXT (decl) = current_function_decl;
+ TREE_USED (decl) = 1;
+ TREE_TYPE (decl) = arg1_type;
+ TREE_READONLY (decl) = TYPE_READONLY (arg1_type);
+ TREE_ADDRESSABLE (decl) = 1;
+ if (c_dialect_cxx ())
+ {
+ stmt = build4 (TARGET_EXPR, arg1_type, decl, arg1,
+ NULL_TREE, NULL_TREE);
+ SET_EXPR_LOCATION (stmt, loc);
+ }
+ else
+ {
+ DECL_INITIAL (decl) = arg1;
+ stmt = build1 (DECL_EXPR, arg1_type, decl);
+ SET_EXPR_LOCATION (stmt, loc);
+ stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt);
+ }
+
+ if (TARGET_VSX)
+ {
+ stmt = build_array_ref (loc, stmt, arg2);
+ stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt,
+ convert (TREE_TYPE (stmt), arg0));
+ stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
+ }
+ else
+ {
+ tree arg1_inner_type;
+ tree innerptrtype;
+ arg1_inner_type = TREE_TYPE (arg1_type);
+ innerptrtype = build_pointer_type (arg1_inner_type);
+
+ stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0);
+ stmt = convert (innerptrtype, stmt);
+ stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
+ stmt = build_indirect_ref (loc, stmt, RO_NULL);
+ stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt,
+ convert (TREE_TYPE (stmt), arg0));
+ stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
+ }
+ return stmt;
+ }
+
+ unsigned int n;
+ for (n = 0;
+ !VOID_TYPE_P (TREE_VALUE (fnargs)) && n < nargs;
+ fnargs = TREE_CHAIN (fnargs), n++)
+ {
+ tree decl_type = TREE_VALUE (fnargs);
+ tree arg = (*arglist)[n];
+ tree type;
+
+ if (arg == error_mark_node)
+ return error_mark_node;
+
+ if (n >= MAX_OVLD_ARGS)
+ abort ();
+
+ arg = default_conversion (arg);
+
+ /* The C++ front-end converts float * to const void * using
+ NOP_EXPR<const void *> (NOP_EXPR<void *> (x)). */
+ type = TREE_TYPE (arg);
+ if (POINTER_TYPE_P (type)
+ && TREE_CODE (arg) == NOP_EXPR
+ && lang_hooks.types_compatible_p (TREE_TYPE (arg),
+ const_ptr_type_node)
+ && lang_hooks.types_compatible_p (TREE_TYPE (TREE_OPERAND (arg, 0)),
+ ptr_type_node))
+ {
+ arg = TREE_OPERAND (arg, 0);
+ type = TREE_TYPE (arg);
+ }
+
+ /* Remove the const from the pointers to simplify the overload
+ matching further down. */
+ if (POINTER_TYPE_P (decl_type)
+ && POINTER_TYPE_P (type)
+ && TYPE_QUALS (TREE_TYPE (type)) != 0)
+ {
+ if (TYPE_READONLY (TREE_TYPE (type))
+ && !TYPE_READONLY (TREE_TYPE (decl_type)))
+ warning (0, "passing argument %d of %qE discards const qualifier "
+ "from pointer target type", n + 1, fndecl);
+ type = build_qualified_type (TREE_TYPE (type), 0);
+ type = build_pointer_type (type);
+ arg = fold_convert (type, arg);
+ }
+
+ /* For RS6000_OVLD_VEC_LXVL, convert any const * to its non constant
+ equivalent to simplify the overload matching below. */
+ if (fcode == RS6000_OVLD_VEC_LXVL)
+ {
+ if (POINTER_TYPE_P (type)
+ && TYPE_READONLY (TREE_TYPE (type)))
+ {
+ type = build_qualified_type (TREE_TYPE (type), 0);
+ type = build_pointer_type (type);
+ arg = fold_convert (type, arg);
+ }
+ }
+
+ args[n] = arg;
+ types[n] = type;
+ }
+
+ /* If the number of arguments did not match the prototype, return NULL
+ and the generic code will issue the appropriate error message. */
+ if (!VOID_TYPE_P (TREE_VALUE (fnargs)) || n < nargs)
+ return NULL;
+
+ if (fcode == RS6000_OVLD_VEC_STEP)
+ {
+ if (TREE_CODE (types[0]) != VECTOR_TYPE)
+ goto bad;
+
+ return build_int_cst (NULL_TREE, TYPE_VECTOR_SUBPARTS (types[0]));
+ }
+
+ {
+ bool unsupported_builtin = false;
+ enum rs6000_gen_builtins overloaded_code;
+ bool supported = false;
+ ovlddata *instance = rs6000_overload_info[adj_fcode].first_instance;
+ gcc_assert (instance != NULL);
+
+ /* Need to special case __builtin_cmpb because the overloaded forms
+ of this function take (unsigned int, unsigned int) or (unsigned
+ long long int, unsigned long long int). Since C conventions
+ allow the respective argument types to be implicitly coerced into
+ each other, the default handling does not provide adequate
+ discrimination between the desired forms of the function. */
+ if (fcode == RS6000_OVLD_SCAL_CMPB)
+ {
+ machine_mode arg1_mode = TYPE_MODE (types[0]);
+ machine_mode arg2_mode = TYPE_MODE (types[1]);
+
+ if (nargs != 2)
+ {
+ error ("builtin %qs only accepts 2 arguments", "__builtin_cmpb");
+ return error_mark_node;
+ }
+
+ /* If any supplied arguments are wider than 32 bits, resolve to
+ 64-bit variant of built-in function. */
+ if (GET_MODE_PRECISION (arg1_mode) > 32
+ || GET_MODE_PRECISION (arg2_mode) > 32)
+ /* Assure all argument and result types are compatible with
+ the built-in function represented by RS6000_BIF_CMPB. */
+ overloaded_code = RS6000_BIF_CMPB;
+ else
+ /* Assure all argument and result types are compatible with
+ the built-in function represented by RS6000_BIF_CMPB_32. */
+ overloaded_code = RS6000_BIF_CMPB_32;
+
+ while (instance && instance->bifid != overloaded_code)
+ instance = instance->next;
+
+ gcc_assert (instance != NULL);
+ tree fntype = rs6000_builtin_info_x[instance->bifid].fntype;
+ tree parmtype0 = TREE_VALUE (TYPE_ARG_TYPES (fntype));
+ tree parmtype1 = TREE_VALUE (TREE_CHAIN (TYPE_ARG_TYPES (fntype)));
+
+ if (rs6000_new_builtin_type_compatible (types[0], parmtype0)
+ && rs6000_new_builtin_type_compatible (types[1], parmtype1))
+ {
+ if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node
+ && rs6000_new_builtin_is_supported (instance->bifid))
+ {
+ tree ret_type = TREE_TYPE (instance->fntype);
+ return altivec_build_new_resolved_builtin (args, n, fntype,
+ ret_type,
+ instance->bifid,
+ fcode);
+ }
+ else
+ unsupported_builtin = true;
+ }
+ }
+ else if (fcode == RS6000_OVLD_VEC_VSIE)
+ {
+ machine_mode arg1_mode = TYPE_MODE (types[0]);
+
+ if (nargs != 2)
+ {
+ error ("builtin %qs only accepts 2 arguments",
+ "scalar_insert_exp");
+ return error_mark_node;
+ }
+
+ /* If supplied first argument is wider than 64 bits, resolve to
+ 128-bit variant of built-in function. */
+ if (GET_MODE_PRECISION (arg1_mode) > 64)
+ {
+ /* If first argument is of float variety, choose variant
+ that expects __ieee128 argument. Otherwise, expect
+ __int128 argument. */
+ if (GET_MODE_CLASS (arg1_mode) == MODE_FLOAT)
+ overloaded_code = RS6000_BIF_VSIEQPF;
+ else
+ overloaded_code = RS6000_BIF_VSIEQP;
+ }
+ else
+ {
+ /* If first argument is of float variety, choose variant
+ that expects double argument. Otherwise, expect
+ long long int argument. */
+ if (GET_MODE_CLASS (arg1_mode) == MODE_FLOAT)
+ overloaded_code = RS6000_BIF_VSIEDPF;
+ else
+ overloaded_code = RS6000_BIF_VSIEDP;
+ }
+
+ while (instance && instance->bifid != overloaded_code)
+ instance = instance->next;
+
+ gcc_assert (instance != NULL);
+ tree fntype = rs6000_builtin_info_x[instance->bifid].fntype;
+ tree parmtype0 = TREE_VALUE (TYPE_ARG_TYPES (fntype));
+ tree parmtype1 = TREE_VALUE (TREE_CHAIN (TYPE_ARG_TYPES (fntype)));
+
+ if (rs6000_new_builtin_type_compatible (types[0], parmtype0)
+ && rs6000_new_builtin_type_compatible (types[1], parmtype1))
+ {
+ if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node
+ && rs6000_new_builtin_is_supported (instance->bifid))
+ {
+ tree ret_type = TREE_TYPE (instance->fntype);
+ return altivec_build_new_resolved_builtin (args, n, fntype,
+ ret_type,
+ instance->bifid,
+ fcode);
+ }
+ else
+ unsupported_builtin = true;
+ }
+ }
+ else
+ {
+ /* Functions with no arguments can have only one overloaded
+ instance. */
+ gcc_assert (n > 0 || !instance->next);
+
+ for (; instance != NULL; instance = instance->next)
+ {
+ bool mismatch = false;
+ tree nextparm = TYPE_ARG_TYPES (instance->fntype);
+
+ for (unsigned int arg_i = 0;
+ arg_i < nargs && nextparm != NULL;
+ arg_i++)
+ {
+ tree parmtype = TREE_VALUE (nextparm);
+ if (!rs6000_new_builtin_type_compatible (types[arg_i],
+ parmtype))
+ {
+ mismatch = true;
+ break;
+ }
+ nextparm = TREE_CHAIN (nextparm);
+ }
+
+ if (mismatch)
+ continue;
+
+ supported = rs6000_new_builtin_is_supported (instance->bifid);
+ if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node
+ && supported)
+ {
+ tree fntype = rs6000_builtin_info_x[instance->bifid].fntype;
+ tree ret_type = TREE_TYPE (instance->fntype);
+ return altivec_build_new_resolved_builtin (args, n, fntype,
+ ret_type,
+ instance->bifid,
+ fcode);
+ }
+ else
+ {
+ unsupported_builtin = true;
+ break;
+ }
+ }
+ }
+
+ if (unsupported_builtin)
+ {
+ const char *name = rs6000_overload_info[adj_fcode].ovld_name;
+ if (!supported)
+ {
+ const char *internal_name
+ = rs6000_builtin_info_x[instance->bifid].bifname;
+ /* An error message making reference to the name of the
+ non-overloaded function has already been issued. Add
+ clarification of the previous message. */
+ rich_location richloc (line_table, input_location);
+ inform (&richloc, "builtin %qs requires builtin %qs",
+ name, internal_name);
+ }
+ else
+ error ("%qs is not supported in this compiler configuration", name);
+
+ return error_mark_node;
+ }
+ }
+ bad:
+ {
+ const char *name = rs6000_overload_info[adj_fcode].ovld_name;
+ error ("invalid parameter combination for AltiVec intrinsic %qs", name);
+ return error_mark_node;
+ }
+}
diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c
index e8625d1..a55cb7c 100644
--- a/gcc/config/rs6000/rs6000-call.c
+++ b/gcc/config/rs6000/rs6000-call.c
@@ -12971,6 +12971,59 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi)
return false;
}
+/* Return true if built-in function FNCODE is supported in this target
+ configuration, based on the enable field of its rs6000_builtin_info_x entry. */
+bool
+rs6000_new_builtin_is_supported (enum rs6000_gen_builtins fncode)
+{
+ switch (rs6000_builtin_info_x[(size_t) fncode].enable)
+ {
+ case ENB_ALWAYS:
+ return true;
+ case ENB_P5:
+ return TARGET_POPCNTB;
+ case ENB_P6:
+ return TARGET_CMPB;
+ case ENB_P7:
+ return TARGET_POPCNTD;
+ case ENB_P7_64:
+ return TARGET_POPCNTD && TARGET_POWERPC64;
+ case ENB_P8:
+ return TARGET_DIRECT_MOVE;
+ case ENB_P8V:
+ return TARGET_P8_VECTOR;
+ case ENB_P9:
+ return TARGET_MODULO;
+ case ENB_P9_64:
+ return TARGET_MODULO && TARGET_POWERPC64;
+ case ENB_P9V:
+ return TARGET_P9_VECTOR;
+ case ENB_P10:
+ return TARGET_POWER10;
+ case ENB_P10_64:
+ return TARGET_POWER10 && TARGET_POWERPC64;
+ case ENB_ALTIVEC:
+ return TARGET_ALTIVEC;
+ case ENB_VSX:
+ return TARGET_VSX;
+ case ENB_CELL:
+ return TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL;
+ case ENB_IEEE128_HW:
+ return TARGET_FLOAT128_HW;
+ case ENB_DFP:
+ return TARGET_DFP;
+ case ENB_CRYPTO:
+ return TARGET_CRYPTO;
+ case ENB_HTM:
+ return TARGET_HTM;
+ case ENB_MMA:
+ return TARGET_MMA;
+ default:
+ gcc_unreachable ();
+ }
+ gcc_unreachable (); /* Not reached: every switch arm above returns or hits gcc_unreachable. */
+}
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c
index f3d6156..f65932e 100644
--- a/gcc/config/rs6000/rs6000-gen-builtins.c
+++ b/gcc/config/rs6000/rs6000-gen-builtins.c
@@ -2314,7 +2314,7 @@ write_decls (void)
fprintf (header_file, "extern void rs6000_init_generated_builtins ();\n\n");
fprintf (header_file,
- "extern bool rs6000_new_builtin_is_supported_p "
+ "extern bool rs6000_new_builtin_is_supported "
"(rs6000_gen_builtins);\n");
fprintf (header_file,
"extern tree rs6000_builtin_decl (unsigned, "
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 2570937..ad81dfb 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -21728,7 +21728,8 @@ rs6000_xcoff_encode_section_info (tree decl, rtx rtl, int first)
if (decl
&& DECL_P (decl)
&& VAR_OR_FUNCTION_DECL_P (decl)
- && symtab_node::get (decl)->alias == 0
+ && (symtab_node::get (decl) == NULL
+ || symtab_node::get (decl)->alias == 0)
&& symname[strlen (symname) - 1] != ']')
{
const char *smclass = NULL;
@@ -22174,7 +22175,7 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code,
break;
case UNSPEC:
- if (XINT (x, 1) == UNSPEC_MMA_XXSETACCZ)
+ if (XINT (x, 1) == UNSPECV_MMA_XXSETACCZ)
{
*total = 0;
return true;
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 3753de1..c1cb9ab 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -561,10 +561,6 @@ mpower9-minmax
Target Undocumented Mask(P9_MINMAX) Var(rs6000_isa_flags)
Use the new min/max instructions defined in ISA 3.0.
-mtoc-fusion
-Target Undocumented Mask(TOC_FUSION) Var(rs6000_isa_flags)
-Fuse medium/large code model toc references with the memory instruction.
-
mmodulo
Target Undocumented Mask(MODULO) Var(rs6000_isa_flags)
Generate the integer modulo instructions.
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index 92766d8..d48a4b1 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -44,15 +44,11 @@ rs6000-logue.o: $(srcdir)/config/rs6000/rs6000-logue.c
$(COMPILE) $<
$(POSTCOMPILE)
-rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.c
- $(COMPILE) $<
- $(POSTCOMPILE)
-
-rbtree.o: $(srcdir)/config/rs6000/rbtree.c
- $(COMPILE) $<
- $(POSTCOMPILE)
+build/rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.c
+build/rbtree.o: $(srcdir)/config/rs6000/rbtree.c
-rs6000-gen-builtins: rs6000-gen-builtins.o rbtree.o
+build/rs6000-gen-builtins$(build_exeext): build/rs6000-gen-builtins.o \
+ build/rbtree.o $(BUILD_LIBDEPS)
$(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ \
$(filter-out $(BUILD_LIBDEPS), $^) $(BUILD_LIBS)
@@ -62,10 +58,11 @@ rs6000-gen-builtins: rs6000-gen-builtins.o rbtree.o
# <recipe>
# For now, the header files depend on rs6000-builtins.c, which avoids
# races because the .c file is closed last in rs6000-gen-builtins.c.
-rs6000-builtins.c: rs6000-gen-builtins \
+rs6000-builtins.c: build/rs6000-gen-builtins$(build_exeext) \
$(srcdir)/config/rs6000/rs6000-builtin-new.def \
$(srcdir)/config/rs6000/rs6000-overload.def
- ./rs6000-gen-builtins $(srcdir)/config/rs6000/rs6000-builtin-new.def \
+ $(RUN_GEN) ./build/rs6000-gen-builtins$(build_exeext) \
+ $(srcdir)/config/rs6000/rs6000-builtin-new.def \
$(srcdir)/config/rs6000/rs6000-overload.def rs6000-builtins.h \
rs6000-builtins.c rs6000-vecdefines.h
diff --git a/gcc/config/sparc/leon5.md b/gcc/config/sparc/leon5.md
new file mode 100644
index 0000000..6a065b1
--- /dev/null
+++ b/gcc/config/sparc/leon5.md
@@ -0,0 +1,103 @@
+;; Scheduling description for LEON5.
+;; Copyright (C) 2021 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+
+;; The LEON5 can often dual issue instructions from the same 64-bit aligned
+;; double word if there are no data dependencies.
+;;
+;; Avoid scheduling load/store, FPU, and multiply instructions back to
+;; back, regardless of data dependencies.
+;;
+;; Push comparisons away from the associated branch instruction.
+;;
+;; Avoid scheduling ALU instructions with data dependencies back to back.
+;;
+;; Schedule three instructions between load and dependent instruction.
+
+(define_automaton "leon5")
+
+(define_cpu_unit "leon5_memory" "leon5") ; reserved by the load/store reservations in this file
+(define_cpu_unit "leon5_mul" "leon5") ; reserved by leon5_imul
+(define_cpu_unit "grfpu_d" "grfpu") ; NOTE(review): automaton "grfpu" is not declared in this file -- presumably leon.md; confirm
+(define_cpu_unit "grfpu_s" "grfpu") ; reserved by the square-root reservations
+
+(define_insn_reservation "leon5_load" 4
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "load,sload"))
+ "leon5_memory * 2, nothing * 2") ; memory unit for 2 cycles, then 2 cycles with no unit reserved
+
+(define_insn_reservation "leon5_fpload" 2
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "fpload"))
+ "leon5_memory * 2 + grfpu_alu * 2") ; NOTE(review): grfpu_alu is not declared in this file -- presumably leon.md; confirm
+
+(define_insn_reservation "leon5_store" 2
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "store"))
+ "leon5_memory * 2")
+
+(define_insn_reservation "leon5_fpstore" 2
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "fpstore"))
+ "leon5_memory * 2 + grfpu_alu * 2")
+
+(define_insn_reservation "leon5_ialu" 2
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "ialu, shift, ialuX"))
+ "nothing * 2") ; no unit reserved: only the latency of 2 constrains scheduling
+
+(define_insn_reservation "leon5_compare" 5
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "compare"))
+ "nothing * 5") ; no unit reserved: only the latency of 5 constrains scheduling
+
+(define_insn_reservation "leon5_imul" 4
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "imul"))
+ "leon5_mul * 2, nothing * 2") ; multiplier for 2 cycles, then 2 cycles with no unit reserved
+
+(define_insn_reservation "leon5_idiv" 35
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "idiv")) ; integer divide; type "imul" is handled by leon5_imul
+ "nothing * 35") ; no unit reserved: only the latency of 35 constrains scheduling
+
+(define_insn_reservation "leon5_fp_alu" 5
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "fp,fpcmp,fpmul,fpmove"))
+ "grfpu_alu * 2, nothing*3") ; NOTE(review): grfpu_alu is not declared in this file -- presumably leon.md; confirm
+
+(define_insn_reservation "leon5_fp_divs" 17
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "fpdivs"))
+ "grfpu_alu * 2 + grfpu_d*16, nothing") ; grfpu_d is the unit declared in this file for divides
+
+(define_insn_reservation "leon5_fp_divd" 18
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "fpdivd"))
+ "grfpu_alu * 2 + grfpu_d*17, nothing")
+
+(define_insn_reservation "leon5_fp_sqrts" 25
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "fpsqrts"))
+ "grfpu_alu * 2 + grfpu_s*24, nothing") ; grfpu_s is the unit declared in this file for square roots
+
+(define_insn_reservation "leon5_fp_sqrtd" 26
+ (and (eq_attr "cpu" "leon5")
+ (eq_attr "type" "fpsqrtd"))
+ "grfpu_alu * 2 + grfpu_s*25, nothing")
diff --git a/gcc/config/sparc/sparc-opts.h b/gcc/config/sparc/sparc-opts.h
index 1af556e..9299cf6 100644
--- a/gcc/config/sparc/sparc-opts.h
+++ b/gcc/config/sparc/sparc-opts.h
@@ -31,6 +31,7 @@ enum sparc_processor_type {
PROCESSOR_HYPERSPARC,
PROCESSOR_LEON,
PROCESSOR_LEON3,
+ PROCESSOR_LEON5,
PROCESSOR_LEON3V7,
PROCESSOR_SPARCLITE,
PROCESSOR_F930,
diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c
index 06f41d7..6bc6f0a 100644
--- a/gcc/config/sparc/sparc.c
+++ b/gcc/config/sparc/sparc.c
@@ -270,6 +270,31 @@ struct processor_costs leon3_costs = {
};
static const
+struct processor_costs leon5_costs = {
+ COSTS_N_INSNS (1), /* int load */
+ COSTS_N_INSNS (1), /* int signed load */
+ COSTS_N_INSNS (1), /* int zeroed load */
+ COSTS_N_INSNS (1), /* float load */
+ COSTS_N_INSNS (1), /* fmov, fneg, fabs */
+ COSTS_N_INSNS (1), /* fadd, fsub */
+ COSTS_N_INSNS (1), /* fcmp */
+ COSTS_N_INSNS (1), /* fmov, fmovr */
+ COSTS_N_INSNS (1), /* fmul */
+ COSTS_N_INSNS (17), /* fdivs */
+ COSTS_N_INSNS (18), /* fdivd */
+ COSTS_N_INSNS (25), /* fsqrts */
+ COSTS_N_INSNS (26), /* fsqrtd */
+ COSTS_N_INSNS (4), /* imul */
+ COSTS_N_INSNS (4), /* imulX */
+ 0, /* imul bit factor */
+ COSTS_N_INSNS (35), /* idiv */
+ COSTS_N_INSNS (35), /* idivX */
+ COSTS_N_INSNS (1), /* movcc/movr */
+ 0, /* shift penalty */
+ 3 /* branch cost */
+};
+
+static const
struct processor_costs sparclet_costs = {
COSTS_N_INSNS (3), /* int load */
COSTS_N_INSNS (3), /* int signed load */
@@ -575,6 +600,7 @@ static int function_arg_slotno (const CUMULATIVE_ARGS *, machine_mode,
static int supersparc_adjust_cost (rtx_insn *, int, rtx_insn *, int);
static int hypersparc_adjust_cost (rtx_insn *, int, rtx_insn *, int);
+static int leon5_adjust_cost (rtx_insn *, int, rtx_insn *, int);
static void sparc_emit_set_const32 (rtx, rtx);
static void sparc_emit_set_const64 (rtx, rtx);
@@ -1045,6 +1071,43 @@ atomic_insn_for_leon3_p (rtx_insn *insn)
}
}
+/* Return true if the pattern of INSN is a plain SET and the insn's
+   "type" attribute is store or fpstore; return false otherwise.  */
+
+static bool
+store_insn_p (rtx_insn *insn)
+{
+  /* Patterns that are not a plain SET are never classified as stores;
+     the type attribute is not even queried for them.  */
+  if (GET_CODE (PATTERN (insn)) != SET)
+    return false;
+
+  const enum attr_type kind = get_attr_type (insn);
+
+  if (kind == TYPE_STORE || kind == TYPE_FPSTORE)
+    return true;
+  return false;
+}
+
+/* Return true if the pattern of INSN is a plain SET and the insn's
+   "type" attribute is load, sload or fpload; return false otherwise.  */
+
+static bool
+load_insn_p (rtx_insn *insn)
+{
+  /* Patterns that are not a plain SET are never classified as loads;
+     the type attribute is not even queried for them.  */
+  if (GET_CODE (PATTERN (insn)) != SET)
+    return false;
+
+  const enum attr_type kind = get_attr_type (insn);
+
+  if (kind == TYPE_LOAD || kind == TYPE_SLOAD || kind == TYPE_FPLOAD)
+    return true;
+
+  return false;
+}
+
/* We use a machine specific pass to enable workarounds for errata.
We need to have the (essentially) final form of the insn stream in order
@@ -1057,10 +1120,29 @@ atomic_insn_for_leon3_p (rtx_insn *insn)
&& GET_CODE (PATTERN (INSN)) != USE \
&& GET_CODE (PATTERN (INSN)) != CLOBBER)
+/* Like next_active_insn, but keep going past insns whose pattern is an
+   UNSPEC_VOLATILE or ASM_INPUT and past asms with an empty template.  */
+rtx_insn *
+next_active_non_empty_insn (rtx_insn *insn)
+{
+  for (insn = next_active_insn (insn); insn; insn = next_active_insn (insn))
+    {
+      rtx pat = PATTERN (insn);
+      if (GET_CODE (pat) == UNSPEC_VOLATILE || GET_CODE (pat) == ASM_INPUT)
+        continue;
+      if (!USEFUL_INSN_P (insn) || asm_noperands (pat) < 0
+          || strcmp (decode_asm_operands (pat, NULL, NULL, NULL,
+                                          NULL, NULL), "") != 0)
+        break;
+    }
+  return insn;
+}
+
static unsigned int
sparc_do_work_around_errata (void)
{
rtx_insn *insn, *next;
+ bool find_first_useful = true;
/* Force all instructions to be split into their final form. */
split_all_insns_noflow ();
@@ -1085,6 +1167,16 @@ sparc_do_work_around_errata (void)
else
jump = NULL;
+ /* Do not begin function with atomic instruction. */
+ if (sparc_fix_ut700
+ && find_first_useful
+ && USEFUL_INSN_P (insn))
+ {
+ find_first_useful = false;
+ if (atomic_insn_for_leon3_p (insn))
+ emit_insn_before (gen_nop (), insn);
+ }
+
/* Place a NOP at the branch target of an integer branch if it is a
floating-point operation or a floating-point branch. */
if (sparc_fix_gr712rc
@@ -1105,9 +1197,7 @@ sparc_do_work_around_errata (void)
instruction at branch target. */
if (sparc_fix_ut700
&& NONJUMP_INSN_P (insn)
- && (set = single_set (insn)) != NULL_RTX
- && mem_ref (SET_SRC (set))
- && REG_P (SET_DEST (set)))
+ && load_insn_p (insn))
{
if (jump && jump_to_label_p (jump))
{
@@ -1116,7 +1206,7 @@ sparc_do_work_around_errata (void)
emit_insn_before (gen_nop (), target);
}
- next = next_active_insn (insn);
+ next = next_active_non_empty_insn (insn);
if (!next)
break;
@@ -1212,30 +1302,19 @@ sparc_do_work_around_errata (void)
if (sparc_fix_b2bst
&& NONJUMP_INSN_P (insn)
&& (set = single_set (insn)) != NULL_RTX
- && MEM_P (SET_DEST (set)))
+ && store_insn_p (insn))
{
/* Sequence B begins with a double-word store. */
bool seq_b = GET_MODE_SIZE (GET_MODE (SET_DEST (set))) == 8;
rtx_insn *after;
int i;
- next = next_active_insn (insn);
+ next = next_active_non_empty_insn (insn);
if (!next)
break;
for (after = next, i = 0; i < 2; i++)
{
- /* Skip empty assembly statements. */
- if ((GET_CODE (PATTERN (after)) == UNSPEC_VOLATILE)
- || (USEFUL_INSN_P (after)
- && (asm_noperands (PATTERN (after))>=0)
- && !strcmp (decode_asm_operands (PATTERN (after),
- NULL, NULL, NULL,
- NULL, NULL), "")))
- after = next_active_insn (after);
- if (!after)
- break;
-
/* If the insn is a branch, then it cannot be problematic. */
if (!NONJUMP_INSN_P (after)
|| GET_CODE (PATTERN (after)) == SEQUENCE)
@@ -1245,8 +1324,7 @@ sparc_do_work_around_errata (void)
if (seq_b)
{
/* Add NOP if followed by a store. */
- if ((set = single_set (after)) != NULL_RTX
- && MEM_P (SET_DEST (set)))
+ if (store_insn_p (after))
insert_nop = true;
/* Otherwise it is ok. */
@@ -1261,15 +1339,14 @@ sparc_do_work_around_errata (void)
&& (MEM_P (SET_DEST (set)) || mem_ref (SET_SRC (set))))
break;
- after = next_active_insn (after);
+ after = next_active_non_empty_insn (after);
if (!after)
break;
}
/* Add NOP if third instruction is a store. */
if (i == 1
- && (set = single_set (after)) != NULL_RTX
- && MEM_P (SET_DEST (set)))
+ && store_insn_p (after))
insert_nop = true;
}
}
@@ -1596,6 +1673,10 @@ dump_target_flag_bits (const int flags)
fprintf (stderr, "CBCOND ");
if (flags & MASK_DEPRECATED_V8_INSNS)
fprintf (stderr, "DEPRECATED_V8_INSNS ");
+ if (flags & MASK_LEON)
+ fprintf (stderr, "LEON ");
+ if (flags & MASK_LEON3)
+ fprintf (stderr, "LEON3 ");
if (flags & MASK_SPARCLET)
fprintf (stderr, "SPARCLET ");
if (flags & MASK_SPARCLITE)
@@ -1632,6 +1713,7 @@ sparc_option_override (void)
{ TARGET_CPU_hypersparc, PROCESSOR_HYPERSPARC },
{ TARGET_CPU_leon, PROCESSOR_LEON },
{ TARGET_CPU_leon3, PROCESSOR_LEON3 },
+ { TARGET_CPU_leon5, PROCESSOR_LEON5 },
{ TARGET_CPU_leon3v7, PROCESSOR_LEON3V7 },
{ TARGET_CPU_sparclite, PROCESSOR_F930 },
{ TARGET_CPU_sparclite86x, PROCESSOR_SPARCLITE86X },
@@ -1663,6 +1745,7 @@ sparc_option_override (void)
{ "hypersparc", MASK_ISA, MASK_V8 },
{ "leon", MASK_ISA|MASK_FSMULD, MASK_V8|MASK_LEON },
{ "leon3", MASK_ISA, MASK_V8|MASK_LEON3 },
+ { "leon5", MASK_ISA, MASK_V8|MASK_LEON3 },
{ "leon3v7", MASK_ISA, MASK_LEON3 },
{ "sparclite", MASK_ISA, MASK_SPARCLITE },
/* The Fujitsu MB86930 is the original sparclite chip, with no FPU. */
@@ -1973,6 +2056,9 @@ sparc_option_override (void)
case PROCESSOR_LEON3V7:
sparc_costs = &leon3_costs;
break;
+ case PROCESSOR_LEON5:
+ sparc_costs = &leon5_costs;
+ break;
case PROCESSOR_SPARCLET:
case PROCESSOR_TSC701:
sparc_costs = &sparclet_costs;
@@ -10120,11 +10206,64 @@ hypersparc_adjust_cost (rtx_insn *insn, int dtype, rtx_insn *dep_insn,
}
static int
+leon5_adjust_cost (rtx_insn *insn, int dtype, rtx_insn *dep_insn,
+ int cost)
+{
+ enum attr_type insn_type, dep_type;
+ rtx pat = PATTERN (insn);
+ rtx dep_pat = PATTERN (dep_insn);
+
+ if (recog_memoized (insn) < 0 || recog_memoized (dep_insn) < 0)
+ return cost; /* An insn that is not recognized: leave COST unchanged. */
+
+ insn_type = get_attr_type (insn);
+ dep_type = get_attr_type (dep_insn);
+
+ switch (dtype)
+ {
+ case REG_DEP_TRUE:
+ /* Data dependency; DEP_INSN writes a register that INSN reads some
+ cycles later. Only a store fed by an ialu or shift insn is adjusted. */
+
+ switch (insn_type)
+ {
+ case TYPE_STORE:
+ /* Try to schedule three instructions between the store and
+ the ALU instruction that generated the data. */
+ if (dep_type == TYPE_IALU || dep_type == TYPE_SHIFT)
+ {
+ if (GET_CODE (pat) != SET || GET_CODE (dep_pat) != SET)
+ break; /* Not both plain SETs: fall through to the unchanged COST. */
+
+ if (rtx_equal_p (SET_DEST (dep_pat), SET_SRC (pat)))
+ return 4; /* Only when the stored value is exactly what DEP_INSN set. */
+ }
+ break;
+ default:
+ break;
+ }
+ break;
+ case REG_DEP_ANTI:
+ /* Penalize anti-dependencies for FPU instructions. */
+ if (fpop_insn_p (insn) || insn_type == TYPE_FPLOAD)
+ return 4;
+ break;
+ default:
+ break;
+ }
+
+ return cost; /* No LEON5-specific adjustment applied. */
+}
+
+static int
sparc_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep, int cost,
unsigned int)
{
switch (sparc_cpu)
{
+ case PROCESSOR_LEON5:
+ cost = leon5_adjust_cost (insn, dep_type, dep, cost);
+ break;
case PROCESSOR_SUPERSPARC:
cost = supersparc_adjust_cost (insn, dep_type, dep, cost);
break;
diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h
index 4da5a06..edafa99 100644
--- a/gcc/config/sparc/sparc.h
+++ b/gcc/config/sparc/sparc.h
@@ -120,21 +120,22 @@ along with GCC; see the file COPYING3. If not see
#define TARGET_CPU_leon 4
#define TARGET_CPU_leon3 5
#define TARGET_CPU_leon3v7 6
-#define TARGET_CPU_sparclite 7
-#define TARGET_CPU_f930 7 /* alias */
-#define TARGET_CPU_f934 7 /* alias */
-#define TARGET_CPU_sparclite86x 8
-#define TARGET_CPU_sparclet 9
-#define TARGET_CPU_tsc701 9 /* alias */
-#define TARGET_CPU_v9 10 /* generic v9 implementation */
-#define TARGET_CPU_sparcv9 10 /* alias */
-#define TARGET_CPU_sparc64 10 /* alias */
-#define TARGET_CPU_ultrasparc 11
-#define TARGET_CPU_ultrasparc3 12
-#define TARGET_CPU_niagara 13
-#define TARGET_CPU_niagara2 14
-#define TARGET_CPU_niagara3 15
-#define TARGET_CPU_niagara4 16
+#define TARGET_CPU_leon5 7
+#define TARGET_CPU_sparclite 8
+#define TARGET_CPU_f930 8 /* alias */
+#define TARGET_CPU_f934 8 /* alias */
+#define TARGET_CPU_sparclite86x 9
+#define TARGET_CPU_sparclet 10
+#define TARGET_CPU_tsc701 10 /* alias */
+#define TARGET_CPU_v9 11 /* generic v9 implementation */
+#define TARGET_CPU_sparcv9 11 /* alias */
+#define TARGET_CPU_sparc64 11 /* alias */
+#define TARGET_CPU_ultrasparc 12
+#define TARGET_CPU_ultrasparc3 13
+#define TARGET_CPU_niagara 14
+#define TARGET_CPU_niagara2 15
+#define TARGET_CPU_niagara3 16
+#define TARGET_CPU_niagara4 17
#define TARGET_CPU_niagara7 19
#define TARGET_CPU_m8 20
@@ -229,7 +230,8 @@ along with GCC; see the file COPYING3. If not see
#endif
#if TARGET_CPU_DEFAULT == TARGET_CPU_leon \
- || TARGET_CPU_DEFAULT == TARGET_CPU_leon3
+ || TARGET_CPU_DEFAULT == TARGET_CPU_leon3 \
+ || TARGET_CPU_DEFAULT == TARGET_CPU_leon5
#define CPP_CPU32_DEFAULT_SPEC "-D__leon__ -D__sparc_v8__"
#define ASM_CPU32_DEFAULT_SPEC AS_LEON_FLAG
#endif
@@ -285,6 +287,7 @@ along with GCC; see the file COPYING3. If not see
%{mcpu=hypersparc:-D__hypersparc__ -D__sparc_v8__} \
%{mcpu=leon:-D__leon__ -D__sparc_v8__} \
%{mcpu=leon3:-D__leon__ -D__sparc_v8__} \
+%{mcpu=leon5:-D__leon__ -D__sparc_v8__} \
%{mcpu=leon3v7:-D__leon__} \
%{mcpu=v9:-D__sparc_v9__} \
%{mcpu=ultrasparc:-D__sparc_v9__} \
@@ -337,6 +340,7 @@ along with GCC; see the file COPYING3. If not see
%{mcpu=hypersparc:-Av8} \
%{mcpu=leon:" AS_LEON_FLAG "} \
%{mcpu=leon3:" AS_LEON_FLAG "} \
+%{mcpu=leon5:" AS_LEON_FLAG "} \
%{mcpu=leon3v7:" AS_LEONV7_FLAG "} \
%{mv8plus:-Av8plus} \
%{mcpu=v9:-Av9} \
diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md
index 24b76e0..294c918 100644
--- a/gcc/config/sparc/sparc.md
+++ b/gcc/config/sparc/sparc.md
@@ -233,6 +233,7 @@
hypersparc,
leon,
leon3,
+ leon5,
leon3v7,
sparclite,
f930,
@@ -638,6 +639,7 @@
(include "supersparc.md")
(include "hypersparc.md")
(include "leon.md")
+(include "leon5.md")
(include "sparclet.md")
(include "ultra1_2.md")
(include "ultra3.md")
@@ -8353,9 +8355,15 @@ visl")
(unspec:SI [(match_operand:SI 1 "memory_operand" "m")] UNSPEC_SP_SET))
(set (match_scratch:SI 2 "=&r") (const_int 0))]
"TARGET_ARCH32"
- "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2"
+{
+ if (sparc_fix_b2bst)
+ return "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2\;nop";
+ else
+ return "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2";
+}
[(set_attr "type" "multi")
- (set_attr "length" "3")])
+ (set (attr "length") (if_then_else (eq_attr "fix_b2bst" "true")
+ (const_int 4) (const_int 3)))])
(define_insn "stack_protect_set64"
[(set (match_operand:DI 0 "memory_operand" "=m")
diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt
index fb79267..658a187 100644
--- a/gcc/config/sparc/sparc.opt
+++ b/gcc/config/sparc/sparc.opt
@@ -176,6 +176,9 @@ EnumValue
Enum(sparc_processor) String(leon3v7) Value(PROCESSOR_LEON3V7)
EnumValue
+Enum(sparc_processor) String(leon5) Value(PROCESSOR_LEON5)
+
+EnumValue
Enum(sparc_processor) String(sparclite) Value(PROCESSOR_SPARCLITE)
EnumValue
diff --git a/gcc/config/xtensa/t-xtensa b/gcc/config/xtensa/t-xtensa
index 973815c..d06e492 100644
--- a/gcc/config/xtensa/t-xtensa
+++ b/gcc/config/xtensa/t-xtensa
@@ -16,4 +16,5 @@
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.
+TM_H += $(srcdir)/../include/xtensa-config.h
$(out_object_file): gt-xtensa.h