author | Ian Lance Taylor <iant@golang.org> | 2021-09-17 08:46:39 -0700
committer | Ian Lance Taylor <iant@golang.org> | 2021-09-17 08:46:39 -0700
commit | a0791d0ed4f147ef347e83f4aedc7ad03f1a2008 (patch)
tree | 7b3526910798e4cff7a7200d684383046bac6225 /gcc/config
parent | e252b51ccde010cbd2a146485d8045103cd99533 (diff)
parent | 89be17a1b231ade643f28fbe616d53377e069da8 (diff)
Merge from trunk revision 89be17a1b231ade643f28fbe616d53377e069da8.
Diffstat (limited to 'gcc/config')
36 files changed, 8988 insertions, 748 deletions
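Most of the gcc/config churn in this merge is the AVX512-FP16 work in avx512fp16intrin.h: new `_Float16` cast/undefined/zero-extend helpers plus the add/sub/mul/div, min/max, compare, sqrt, rsqrt, rcp, scalef and reduce intrinsic families, together with a rename of the underlying builtins from the `__builtin_ia32_v*_v32hf_mask*` / `__builtin_ia32_v*_v8hf_mask*` style to `__builtin_ia32_*ph512_mask*` / `__builtin_ia32_*sh_mask*`. As a rough illustration of how the merged intrinsics are used from user code (a minimal sketch, not part of the commit; it assumes a GCC with AVX512FP16 support and the `-mavx512fp16` option):

```c
/* Minimal sketch using AVX512-FP16 intrinsics touched by this merge.
   Assumes a toolchain with AVX512FP16 support, e.g.
     gcc -O2 -mavx512fp16 fp16_demo.c
   Not part of the commit itself.  */
#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m512h a = _mm512_set1_ph ((_Float16) 1.5f);   /* broadcast 1.5 into 32 lanes */
  __m512h b = _mm512_set1_ph ((_Float16) 0.25f);
  __mmask32 m = 0x0000ffff;                       /* update only the low 16 lanes */

  /* Masked add: lanes whose mask bit is clear keep the value from 'a'.  */
  __m512h c = _mm512_mask_add_ph (a, m, a, b);

  /* Scalar extraction helper added in this merge.  */
  _Float16 lane0 = _mm512_cvtsh_h (c);

  /* Cast helper added in this merge: reinterpret the half vector as floats.  */
  __m512 as_ps = _mm512_castph_ps (c);
  (void) as_ps;

  printf ("lane 0 = %f\n", (double) lane0);
  return 0;
}
```

Note that the builtin renames below (for example `__builtin_ia32_vaddph_v32hf_mask` becoming `__builtin_ia32_addph512_mask`) only change the internal names the header expands to; user code continues to call the `_mm512_*` and `_mm_*` intrinsics shown above.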
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 90ba85e..4919d27 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -4966,8 +4966,8 @@ core_3, archs4x, archs4xd, archs4xd_slow" (const_int 1)) (label_ref (match_operand 1 "" "")) (pc))) - (set (match_dup 0) (plus (match_dup 0) (const_int -1))) - (unspec [(const_int 0)] UNSPEC_ARC_LP) + (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) + (unspec:SI [(const_int 0)] UNSPEC_ARC_LP) (clobber (match_dup 2))])] "" { @@ -4996,8 +4996,8 @@ core_3, archs4x, archs4xd, archs4xd_slow" (const_int 1)) (label_ref (match_operand 1 "" "")) (pc))) - (set (match_dup 0) (plus (match_dup 0) (const_int -1))) - (unspec [(const_int 0)] UNSPEC_ARC_LP) + (set (match_dup 0) (plus:SI (match_dup 0) (const_int -1))) + (unspec:SI [(const_int 0)] UNSPEC_ARC_LP) (clobber (match_scratch:SI 2 "=X,&r"))] "" "@ diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index ed8ad84..a5041ed 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -192,6 +192,159 @@ _mm512_setzero_ph (void) return _mm512_set1_ph (0.0f); } +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_ph (void) +{ + __m128h __Y = __Y; + return __Y; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_undefined_ph (void) +{ + __m256h __Y = __Y; + return __Y; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_ph (void) +{ + __m512h __Y = __Y; + return __Y; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_h (__m128h __A) +{ + return __A[0]; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsh_h (__m256h __A) +{ + return __A[0]; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsh_h (__m512h __A) +{ + return __A[0]; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_ps (__m512h __a) +{ + return (__m512) __a; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_pd (__m512h __a) +{ + return (__m512d) __a; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_si512 (__m512h __a) +{ + return (__m512i) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph512_ph128 (__m512h __A) +{ + union + { + __m128h a[4]; + __m512h v; + } u = { .v = __A }; + return u.a[0]; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph512_ph256 (__m512h __A) +{ + union + { + __m256h a[2]; + __m512h v; + } u = { .v = __A }; + return u.a[0]; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph128_ph512 (__m128h __A) +{ + union + { + __m128h a[4]; + __m512h v; + } u; + u.a[0] = __A; + return u.v; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph256_ph512 (__m256h __A) +{ + union + { + __m256h a[2]; + __m512h v; + } u; + u.a[0] = __A; + return u.v; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_zextph128_ph512 (__m128h __A) +{ + return (__m512h) _mm512_insertf32x4 (_mm512_setzero_ps (), + (__m128) __A, 0); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextph256_ph512 (__m256h __A) +{ + return (__m512h) _mm512_insertf64x4 (_mm512_setzero_pd (), + (__m256d) __A, 0); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps_ph (__m512 __a) +{ + return (__m512h) __a; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd_ph (__m512d __a) +{ + return (__m512h) __a; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_ph (__m512i __a) +{ + return (__m512h) __a; +} + /* Create a vector with element 0 as F and the rest zero. */ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -229,15 +382,15 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vaddph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_addph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vaddph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_addph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } extern __inline __m512h @@ -251,15 +404,15 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vsubph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_subph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vsubph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_subph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } extern __inline __m512h @@ -273,15 +426,15 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vmulph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_mulph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vmulph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_mulph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } extern __inline __m512h @@ -295,15 +448,15 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vdivph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_divph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vdivph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_divph512_mask 
(__B, __C, + _mm512_setzero_ph (), __A); } #ifdef __OPTIMIZE__ @@ -311,9 +464,9 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_add_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vaddph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_addph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -321,7 +474,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vaddph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_addph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -329,18 +482,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vaddph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_addph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vsubph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_subph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -348,7 +501,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vsubph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_subph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -356,18 +509,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vsubph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_subph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vmulph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_mulph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -375,7 +528,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vmulph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_mulph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -383,18 +536,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vmulph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_mulph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_div_round_ph (__m512h __A, 
__m512h __B, const int __C) { - return __builtin_ia32_vdivph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_divph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -402,7 +555,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vdivph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_divph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -410,67 +563,67 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vdivph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_divph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } #else #define _mm512_add_round_ph(A, B, C) \ - ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((A), (B), \ - _mm512_setzero_ph (),\ - (__mmask32)-1, (C))) + ((__m512h)__builtin_ia32_addph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) -#define _mm512_mask_add_round_ph(A, B, C, D, E) \ - ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((C), (D), (A), (B), (E))) +#define _mm512_mask_add_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_addph512_mask_round((C), (D), (A), (B), (E))) #define _mm512_maskz_add_round_ph(A, B, C, D) \ - ((__m512h)__builtin_ia32_vaddph_v32hf_mask_round((B), (C), \ - _mm512_setzero_ph (),\ - (A), (D))) + ((__m512h)__builtin_ia32_addph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #define _mm512_sub_round_ph(A, B, C) \ - ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((A), (B), \ - _mm512_setzero_ph (),\ - (__mmask32)-1, (C))) + ((__m512h)__builtin_ia32_subph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) -#define _mm512_mask_sub_round_ph(A, B, C, D, E) \ - ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((C), (D), (A), (B), (E))) +#define _mm512_mask_sub_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_subph512_mask_round((C), (D), (A), (B), (E))) #define _mm512_maskz_sub_round_ph(A, B, C, D) \ - ((__m512h)__builtin_ia32_vsubph_v32hf_mask_round((B), (C), \ - _mm512_setzero_ph (),\ - (A), (D))) + ((__m512h)__builtin_ia32_subph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #define _mm512_mul_round_ph(A, B, C) \ - ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((A), (B), \ - _mm512_setzero_ph (),\ - (__mmask32)-1, (C))) + ((__m512h)__builtin_ia32_mulph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) -#define _mm512_mask_mul_round_ph(A, B, C, D, E) \ - ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((C), (D), (A), (B), (E))) +#define _mm512_mask_mul_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_mulph512_mask_round((C), (D), (A), (B), (E))) #define _mm512_maskz_mul_round_ph(A, B, C, D) \ - ((__m512h)__builtin_ia32_vmulph_v32hf_mask_round((B), (C), \ - _mm512_setzero_ph (),\ - (A), (D))) + ((__m512h)__builtin_ia32_mulph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #define _mm512_div_round_ph(A, B, C) \ - ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((A), (B), \ - _mm512_setzero_ph (),\ - (__mmask32)-1, (C))) + ((__m512h)__builtin_ia32_divph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) -#define 
_mm512_mask_div_round_ph(A, B, C, D, E) \ - ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((C), (D), (A), (B), (E))) +#define _mm512_mask_div_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_divph512_mask_round((C), (D), (A), (B), (E))) #define _mm512_maskz_div_round_ph(A, B, C, D) \ - ((__m512h)__builtin_ia32_vdivph_v32hf_mask_round((B), (C), \ - _mm512_setzero_ph (),\ - (A), (D))) + ((__m512h)__builtin_ia32_divph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #endif /* __OPTIMIZE__ */ /* Intrinsics of v[add,sub,mul,div]sh. */ extern __inline __m128h -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_sh (__m128h __A, __m128h __B) { __A[0] += __B[0]; @@ -481,15 +634,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_add_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vaddsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_addsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_add_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vaddsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_addsh_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m128h @@ -504,15 +657,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_sub_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vsubsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_subsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_sub_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vsubsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_subsh_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m128h @@ -527,14 +680,14 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_mul_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vmulsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_mulsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_mul_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vmulsh_v8hf_mask (__B, __C, _mm_setzero_ph (), __A); + return __builtin_ia32_mulsh_mask (__B, __C, _mm_setzero_ph (), __A); } extern __inline __m128h @@ -549,15 +702,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_div_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vdivsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_divsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_div_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vdivsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_divsh_mask (__B, __C, _mm_setzero_ph (), + __A); } #ifdef __OPTIMIZE__ @@ -565,9 +718,9 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vaddsh_v8hf_mask_round (__A, __B, 
- _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_addsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -575,7 +728,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_add_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vaddsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_addsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -583,18 +736,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_add_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vaddsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_addsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vsubsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_subsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -602,7 +755,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_sub_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vsubsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_subsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -610,18 +763,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_sub_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vsubsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_subsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vmulsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_mulsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -629,7 +782,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_mul_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vmulsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_mulsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -637,18 +790,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_mul_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vmulsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_mulsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_div_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vdivsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_divsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -656,7 +809,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_div_round_sh (__m128h __A, __mmask8 __B, __m128h __C, 
__m128h __D, const int __E) { - return __builtin_ia32_vdivsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_divsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -664,62 +817,62 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_div_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vdivsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_divsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } #else #define _mm_add_round_sh(A, B, C) \ - ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) + ((__m128h)__builtin_ia32_addsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) #define _mm_mask_add_round_sh(A, B, C, D, E) \ - ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + ((__m128h)__builtin_ia32_addsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_add_round_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_vaddsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_add_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_addsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #define _mm_sub_round_sh(A, B, C) \ - ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) + ((__m128h)__builtin_ia32_subsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) #define _mm_mask_sub_round_sh(A, B, C, D, E) \ - ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + ((__m128h)__builtin_ia32_subsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_sub_round_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_vsubsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_sub_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_subsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #define _mm_mul_round_sh(A, B, C) \ - ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) + ((__m128h)__builtin_ia32_mulsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) #define _mm_mask_mul_round_sh(A, B, C, D, E) \ - ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + ((__m128h)__builtin_ia32_mulsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_mul_round_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_vmulsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_mul_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_mulsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #define _mm_div_round_sh(A, B, C) \ - ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) + ((__m128h)__builtin_ia32_divsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) #define _mm_mask_div_round_sh(A, B, C, D, E) \ - ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((C), (D), (A), (B), (E))) + ((__m128h)__builtin_ia32_divsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_div_round_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_vdivsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_div_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_divsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #endif /* __OPTIMIZE__ */ /* Intrinsic vmaxph vminph. 
*/ @@ -727,48 +880,48 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_max_ph (__m512h __A, __m512h __B) { - return __builtin_ia32_vmaxph_v32hf_mask (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1); + return __builtin_ia32_maxph512_mask (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_max_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vmaxph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_maxph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_max_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vmaxph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_maxph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_min_ph (__m512h __A, __m512h __B) { - return __builtin_ia32_vminph_v32hf_mask (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1); + return __builtin_ia32_minph512_mask (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_min_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) { - return __builtin_ia32_vminph_v32hf_mask (__C, __D, __A, __B); + return __builtin_ia32_minph512_mask (__C, __D, __A, __B); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_min_ph (__mmask32 __A, __m512h __B, __m512h __C) { - return __builtin_ia32_vminph_v32hf_mask (__B, __C, - _mm512_setzero_ph (), __A); + return __builtin_ia32_minph512_mask (__B, __C, + _mm512_setzero_ph (), __A); } #ifdef __OPTIMIZE__ @@ -776,9 +929,9 @@ extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_max_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vmaxph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_maxph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -786,7 +939,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_max_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vmaxph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_maxph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -794,18 +947,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_max_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vmaxph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_maxph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } extern __inline __m512h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_min_round_ph (__m512h __A, __m512h __B, const int __C) { - return __builtin_ia32_vminph_v32hf_mask_round (__A, __B, - _mm512_setzero_ph (), - (__mmask32) -1, __C); + return __builtin_ia32_minph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); } extern __inline __m512h @@ -813,7 +966,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
_mm512_mask_min_round_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D, const int __E) { - return __builtin_ia32_vminph_v32hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_minph512_mask_round (__C, __D, __A, __B, __E); } extern __inline __m512h @@ -821,37 +974,37 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_min_round_ph (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return __builtin_ia32_vminph_v32hf_mask_round (__B, __C, - _mm512_setzero_ph (), - __A, __D); + return __builtin_ia32_minph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); } #else -#define _mm512_max_round_ph(A, B, C) \ - (__builtin_ia32_vmaxph_v32hf_mask_round ((A), (B), \ - _mm512_setzero_ph (), \ - (__mmask32)-1, (C))) +#define _mm512_max_round_ph(A, B, C) \ + (__builtin_ia32_maxph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) #define _mm512_mask_max_round_ph(A, B, C, D, E) \ - (__builtin_ia32_vmaxph_v32hf_mask_round ((C), (D), (A), (B), (E))) + (__builtin_ia32_maxph512_mask_round ((C), (D), (A), (B), (E))) -#define _mm512_maskz_max_round_ph(A, B, C, D) \ - (__builtin_ia32_vmaxph_v32hf_mask_round ((B), (C), \ - _mm512_setzero_ph (), \ - (A), (D))) +#define _mm512_maskz_max_round_ph(A, B, C, D) \ + (__builtin_ia32_maxph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) -#define _mm512_min_round_ph(A, B, C) \ - (__builtin_ia32_vminph_v32hf_mask_round ((A), (B), \ - _mm512_setzero_ph (), \ - (__mmask32)-1, (C))) +#define _mm512_min_round_ph(A, B, C) \ + (__builtin_ia32_minph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) #define _mm512_mask_min_round_ph(A, B, C, D, E) \ - (__builtin_ia32_vminph_v32hf_mask_round ((C), (D), (A), (B), (E))) + (__builtin_ia32_minph512_mask_round ((C), (D), (A), (B), (E))) -#define _mm512_maskz_min_round_ph(A, B, C, D) \ - (__builtin_ia32_vminph_v32hf_mask_round ((B), (C), \ - _mm512_setzero_ph (), \ - (A), (D))) +#define _mm512_maskz_min_round_ph(A, B, C, D) \ + (__builtin_ia32_minph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) #endif /* __OPTIMIZE__ */ /* Intrinsic vmaxsh vminsh. 
*/ @@ -867,15 +1020,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vmaxsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_maxsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vmaxsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_maxsh_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m128h @@ -890,15 +1043,15 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vminsh_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_minsh_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_sh (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vminsh_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_minsh_mask (__B, __C, _mm_setzero_ph (), + __A); } #ifdef __OPTIMIZE__ @@ -906,9 +1059,9 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vmaxsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_maxsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -916,7 +1069,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vmaxsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_maxsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -924,18 +1077,18 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vmaxsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_maxsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_round_sh (__m128h __A, __m128h __B, const int __C) { - return __builtin_ia32_vminsh_v8hf_mask_round (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1, __C); + return __builtin_ia32_minsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); } extern __inline __m128h @@ -943,7 +1096,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_round_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D, const int __E) { - return __builtin_ia32_vminsh_v8hf_mask_round (__C, __D, __A, __B, __E); + return __builtin_ia32_minsh_mask_round (__C, __D, __A, __B, __E); } extern __inline __m128h @@ -951,37 +1104,37 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_round_sh (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return __builtin_ia32_vminsh_v8hf_mask_round (__B, __C, - _mm_setzero_ph (), - __A, __D); + return __builtin_ia32_minsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); } #else -#define _mm_max_round_sh(A, B, C) \ - (__builtin_ia32_vmaxsh_v8hf_mask_round ((A), (B), \ - 
_mm_setzero_ph (), \ - (__mmask8)-1, (C))) +#define _mm_max_round_sh(A, B, C) \ + (__builtin_ia32_maxsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) -#define _mm_mask_max_round_sh(A, B, C, D, E) \ - (__builtin_ia32_vmaxsh_v8hf_mask_round ((C), (D), (A), (B), (E))) +#define _mm_mask_max_round_sh(A, B, C, D, E) \ + (__builtin_ia32_maxsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_max_round_sh(A, B, C, D) \ - (__builtin_ia32_vmaxsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_max_round_sh(A, B, C, D) \ + (__builtin_ia32_maxsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) -#define _mm_min_round_sh(A, B, C) \ - (__builtin_ia32_vminsh_v8hf_mask_round ((A), (B), \ - _mm_setzero_ph (), \ - (__mmask8)-1, (C))) +#define _mm_min_round_sh(A, B, C) \ + (__builtin_ia32_minsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) -#define _mm_mask_min_round_sh(A, B, C, D, E) \ - (__builtin_ia32_vminsh_v8hf_mask_round ((C), (D), (A), (B), (E))) +#define _mm_mask_min_round_sh(A, B, C, D, E) \ + (__builtin_ia32_minsh_mask_round ((C), (D), (A), (B), (E))) -#define _mm_maskz_min_round_sh(A, B, C, D) \ - (__builtin_ia32_vminsh_v8hf_mask_round ((B), (C), \ - _mm_setzero_ph (), \ - (A), (D))) +#define _mm_maskz_min_round_sh(A, B, C, D) \ + (__builtin_ia32_minsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) #endif /* __OPTIMIZE__ */ @@ -991,8 +1144,8 @@ extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cmp_ph_mask (__m512h __A, __m512h __B, const int __C) { - return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask (__A, __B, __C, - (__mmask32) -1); + return (__mmask32) __builtin_ia32_cmpph512_mask (__A, __B, __C, + (__mmask32) -1); } extern __inline __mmask32 @@ -1000,8 +1153,8 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_cmp_ph_mask (__mmask32 __A, __m512h __B, __m512h __C, const int __D) { - return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask (__B, __C, __D, - __A); + return (__mmask32) __builtin_ia32_cmpph512_mask (__B, __C, __D, + __A); } extern __inline __mmask32 @@ -1009,9 +1162,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cmp_round_ph_mask (__m512h __A, __m512h __B, const int __C, const int __D) { - return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask_round (__A, __B, - __C, (__mmask32) -1, - __D); + return (__mmask32) __builtin_ia32_cmpph512_mask_round (__A, __B, + __C, (__mmask32) -1, + __D); } extern __inline __mmask32 @@ -1019,23 +1172,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_cmp_round_ph_mask (__mmask32 __A, __m512h __B, __m512h __C, const int __D, const int __E) { - return (__mmask32) __builtin_ia32_vcmpph_v32hf_mask_round (__B, __C, - __D, __A, - __E); + return (__mmask32) __builtin_ia32_cmpph512_mask_round (__B, __C, + __D, __A, + __E); } #else #define _mm512_cmp_ph_mask(A, B, C) \ - (__builtin_ia32_vcmpph_v32hf_mask ((A), (B), (C), (-1))) + (__builtin_ia32_cmpph512_mask ((A), (B), (C), (-1))) #define _mm512_mask_cmp_ph_mask(A, B, C, D) \ - (__builtin_ia32_vcmpph_v32hf_mask ((B), (C), (D), (A))) + (__builtin_ia32_cmpph512_mask ((B), (C), (D), (A))) -#define _mm512_cmp_round_ph_mask(A, B, C, D) \ - (__builtin_ia32_vcmpph_v32hf_mask_round ((A), (B), (C), (-1), (D))) +#define _mm512_cmp_round_ph_mask(A, B, C, D) \ + (__builtin_ia32_cmpph512_mask_round ((A), (B), (C), (-1), (D))) -#define _mm512_mask_cmp_round_ph_mask(A, B, 
C, D, E) \ - (__builtin_ia32_vcmpph_v32hf_mask_round ((B), (C), (D), (A), (E))) +#define _mm512_mask_cmp_round_ph_mask(A, B, C, D, E) \ + (__builtin_ia32_cmpph512_mask_round ((B), (C), (D), (A), (E))) #endif /* __OPTIMIZE__ */ @@ -1046,9 +1199,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmp_sh_mask (__m128h __A, __m128h __B, const int __C) { return (__mmask8) - __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, - __C, (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + __builtin_ia32_cmpsh_mask_round (__A, __B, + __C, (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline __mmask8 @@ -1057,9 +1210,9 @@ _mm_mask_cmp_sh_mask (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { return (__mmask8) - __builtin_ia32_vcmpsh_v8hf_mask_round (__B, __C, - __D, __A, - _MM_FROUND_CUR_DIRECTION); + __builtin_ia32_cmpsh_mask_round (__B, __C, + __D, __A, + _MM_FROUND_CUR_DIRECTION); } extern __inline __mmask8 @@ -1067,9 +1220,9 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmp_round_sh_mask (__m128h __A, __m128h __B, const int __C, const int __D) { - return (__mmask8) __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, - __C, (__mmask8) -1, - __D); + return (__mmask8) __builtin_ia32_cmpsh_mask_round (__A, __B, + __C, (__mmask8) -1, + __D); } extern __inline __mmask8 @@ -1077,25 +1230,25 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_cmp_round_sh_mask (__mmask8 __A, __m128h __B, __m128h __C, const int __D, const int __E) { - return (__mmask8) __builtin_ia32_vcmpsh_v8hf_mask_round (__B, __C, - __D, __A, - __E); + return (__mmask8) __builtin_ia32_cmpsh_mask_round (__B, __C, + __D, __A, + __E); } #else -#define _mm_cmp_sh_mask(A, B, C) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (C), (-1), \ - (_MM_FROUND_CUR_DIRECTION))) +#define _mm_cmp_sh_mask(A, B, C) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), \ + (_MM_FROUND_CUR_DIRECTION))) -#define _mm_mask_cmp_sh_mask(A, B, C, D) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((B), (C), (D), (A), \ - (_MM_FROUND_CUR_DIRECTION))) +#define _mm_mask_cmp_sh_mask(A, B, C, D) \ + (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), \ + (_MM_FROUND_CUR_DIRECTION))) -#define _mm_cmp_round_sh_mask(A, B, C, D) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (C), (-1), (D))) +#define _mm_cmp_round_sh_mask(A, B, C, D) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), (D))) -#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((B), (C), (D), (A), (E))) +#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \ + (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), (E))) #endif /* __OPTIMIZE__ */ @@ -1104,137 +1257,3792 @@ extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comieq_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_EQ_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comilt_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LT_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
_mm_comile_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LE_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comigt_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GT_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comige_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GE_OS, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comineq_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_NEQ_US, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_US, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomieq_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_EQ_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomilt_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LT_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomile_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_LE_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomigt_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GT_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomige_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_GE_OQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_ucomineq_sh (__m128h __A, __m128h __B) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, _CMP_NEQ_UQ, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_UQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } #ifdef __OPTIMIZE__ extern __inline int 
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comi_sh (__m128h __A, __m128h __B, const int __P) +_mm_comi_sh (__m128h __A, __m128h __B, const int __P) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, __P, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cmpsh_mask_round (__A, __B, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } extern __inline int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_comi_round_sh (__m128h __A, __m128h __B, const int __P, const int __R) { - return __builtin_ia32_vcmpsh_v8hf_mask_round (__A, __B, __P, - (__mmask8) -1,__R); + return __builtin_ia32_cmpsh_mask_round (__A, __B, __P, + (__mmask8) -1,__R); } #else -#define _mm_comi_round_sh(A, B, P, R) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (P), (__mmask8) (-1), (R))) -#define _mm_comi_sh(A, B, P) \ - (__builtin_ia32_vcmpsh_v8hf_mask_round ((A), (B), (P), (__mmask8) (-1), \ - _MM_FROUND_CUR_DIRECTION)) +#define _mm_comi_round_sh(A, B, P, R) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), (R))) +#define _mm_comi_sh(A, B, P) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), \ + _MM_FROUND_CUR_DIRECTION)) #endif /* __OPTIMIZE__ */ +/* Intrinsics vsqrtph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_ph (__m512h __A) +{ + return __builtin_ia32_sqrtph512_mask_round (__A, + _mm512_setzero_ph(), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_sqrtph512_mask_round (__B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_ph (__m512h __A, const int __B) +{ + return __builtin_ia32_sqrtph512_mask_round (__A, + _mm512_setzero_ph(), + (__mmask32) -1, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_ph (__mmask32 __A, __m512h __B, const int __C) +{ + return __builtin_ia32_sqrtph512_mask_round (__B, + _mm512_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_sqrt_round_ph(A, B) \ + (__builtin_ia32_sqrtph512_mask_round ((A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (B))) + +#define _mm512_mask_sqrt_round_ph(A, B, C, D) \ + (__builtin_ia32_sqrtph512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_sqrt_round_ph(A, B, C) \ + (__builtin_ia32_sqrtph512_mask_round ((B), \ + _mm512_setzero_ph (), \ + (A), (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrsqrtph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt_ph (__m512h __A) +{ + return __builtin_ia32_rsqrtph512_mask (__A, _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_rsqrtph512_mask (__C, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_rsqrtph512_mask (__B, _mm512_setzero_ph (), + __A); +} + +/* Intrinsics vrsqrtsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_rsqrtsh_mask (__B, __A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_rsqrtsh_mask (__D, __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_rsqrtsh_mask (__C, __B, _mm_setzero_ph (), + __A); +} + +/* Intrinsics vsqrtsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_sqrtsh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_sqrtsh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_sqrtsh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B, + __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_sqrtsh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __D); +} + +#else +#define _mm_sqrt_round_sh(A, B, C) \ + (__builtin_ia32_sqrtsh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_sqrt_round_sh(A, B, C, D, E) \ + (__builtin_ia32_sqrtsh_mask_round ((D), (C), (A), (B), (E))) + +#define _mm_maskz_sqrt_round_sh(A, B, C, D) \ + (__builtin_ia32_sqrtsh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrcpph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp_ph (__m512h __A) +{ + return __builtin_ia32_rcpph512_mask (__A, _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_rcpph512_mask (__C, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_rcpph512_mask (__B, _mm512_setzero_ph (), + __A); +} + +/* Intrinsics vrcpsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_rcpsh_mask (__B, __A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp_sh (__m128h __A, __mmask32 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_rcpsh_mask (__D, __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp_sh (__mmask32 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_rcpsh_mask (__C, __B, _mm_setzero_ph (), + __A); +} + +/* Intrinsics vscalefph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_ph (__m512h __A, __m512h __B) +{ + return __builtin_ia32_scalefph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_scalefph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_scalefph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_scalefph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_scalef_round_ph(A, B, C) \ + (__builtin_ia32_scalefph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_scalef_round_ph(A, B, C, D, E) \ + (__builtin_ia32_scalefph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_scalef_round_ph(A, B, C, D) \ + (__builtin_ia32_scalefph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* 
__OPTIMIZE__ */ + +/* Intrinsics vscalefsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_scalefsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_scalefsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_scalefsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_scalefsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +#else +#define _mm_scalef_round_sh(A, B, C) \ + (__builtin_ia32_scalefsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_scalef_round_sh(A, B, C, D, E) \ + (__builtin_ia32_scalefsh_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_scalef_round_sh(A, B, C, D) \ + (__builtin_ia32_scalefsh_mask_round ((B), (C), _mm_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vreduceph. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_ph (__m512h __A, int __B) +{ + return __builtin_ia32_reduceph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_ph (__m512h __A, __mmask32 __B, __m512h __C, int __D) +{ + return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_ph (__mmask32 __A, __m512h __B, int __C) +{ + return __builtin_ia32_reduceph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_ph (__m512h __A, int __B, const int __C) +{ + return __builtin_ia32_reduceph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + int __D, const int __E) +{ + return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_ph (__mmask32 __A, __m512h __B, int __C, + const int __D) +{ + return __builtin_ia32_reduceph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_reduce_ph(A, B) \ + (__builtin_ia32_reduceph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_reduce_ph(A, B, C) \ + (__builtin_ia32_reduceph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_reduce_round_ph(A, B, C) \ + (__builtin_ia32_reduceph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_reduce_round_ph(A, B, C, D, E) \ + (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_reduce_round_ph(A, B, C, D) \ + (__builtin_ia32_reduceph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vreducesh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_sh (__m128h __A, __m128h __B, int __C) +{ + return __builtin_ia32_reducesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E) +{ + return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D) +{ + return __builtin_ia32_reducesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_sh (__m128h __A, __m128h __B, int __C, const int __D) +{ + return __builtin_ia32_reducesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E, const int __F) +{ + return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, + __B, __F); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + int __D, const int __E) +{ + return __builtin_ia32_reducesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_reduce_sh(A, B, C) \ + (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_reduce_sh(A, B, C, D, E) \ + (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_reduce_sh(A, B, C, D) \ + (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_reduce_round_sh(A, B, C, D) \ + (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (D))) + +#define _mm_mask_reduce_round_sh(A, B, C, D, E, F) \ + (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), (F))) + +#define _mm_maskz_reduce_round_sh(A, B, C, D, E) \ + (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscaleph. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m512h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_ph (__m512h __A, int __B) +{ + return __builtin_ia32_rndscaleph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_ph (__m512h __A, __mmask32 __B, + __m512h __C, int __D) +{ + return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_ph (__mmask32 __A, __m512h __B, int __C) +{ + return __builtin_ia32_rndscaleph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_round_ph (__m512h __A, int __B, const int __C) +{ + return __builtin_ia32_rndscaleph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_round_ph (__m512h __A, __mmask32 __B, + __m512h __C, int __D, const int __E) +{ + return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_round_ph (__mmask32 __A, __m512h __B, int __C, + const int __D) +{ + return __builtin_ia32_rndscaleph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm512_roundscale_round_ph(A, B, C) \ + (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_roundscale_round_ph(A, B, C, D, E) \ + (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_roundscale_round_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscalesh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_sh (__m128h __A, __m128h __B, int __C) +{ + return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E) +{ + return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D) +{ + return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_round_sh (__m128h __A, __m128h __B, int __C, const int __D) +{ + return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E, const int __F) +{ + return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, + __A, __B, __F); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + int __D, const int __E) +{ + return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_roundscale_sh(A, B, C) \ + (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_sh(A, B, C, D, E) \ + (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_roundscale_sh(A, B, C, D) \ + (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_roundscale_round_sh(A, B, C, D) \ + (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (D))) + +#define _mm_mask_roundscale_round_sh(A, B, C, D, E, F) \ + (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), (F))) + +#define _mm_maskz_roundscale_round_sh(A, B, C, D, E) \ + (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfpclasssh. 
*/
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sh_mask (__m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sh_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm, __U);
+}
+
+#else
+#define _mm_fpclass_sh_mask(X, C) \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X), \
+                                             (int) (C), (__mmask8) (-1)))
+
+#define _mm_mask_fpclass_sh_mask(U, X, C) \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X), \
+                                             (int) (C), (__mmask8) (U)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclassph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_ph_mask (__mmask32 __U, __m512h __A,
+                             const int __imm)
+{
+  return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+                                                       __imm, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_ph_mask (__m512h __A, const int __imm)
+{
+  return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+                                                       __imm,
+                                                       (__mmask32) -1);
+}
+
+#else
+#define _mm512_mask_fpclass_ph_mask(u, x, c) \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+                                                 (int) (c), (__mmask32) (u)))
+
+#define _mm512_fpclass_ph_mask(x, c) \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+                                                 (int) (c), (__mmask32) -1))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vgetexpph, vgetexpsh. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_sh (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) __W, (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_sh (__mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) _mm_setzero_ph (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_ph (__m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) _mm512_setzero_ph (), + (__mmask32) -1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_ph (__m512h __W, __mmask32 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, (__v32hf) __W, + (__mmask32) __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_ph (__mmask32 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) _mm512_setzero_ph (), + (__mmask32) __U, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_sh (__m128h __A, __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + _mm_setzero_ph (), + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_round_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_round_sh (__mmask8 __U, __m128h __A, __m128h __B, + const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_ph (__m512h __A, const int __R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) + _mm512_setzero_ph (), + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_ph (__m512h __W, __mmask32 __U, __m512h __A, + const int __R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) __W, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_ph (__mmask32 __U, __m512h __A, const int 
__R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) + _mm512_setzero_ph (), + (__mmask32) __U, __R); +} + +#else +#define _mm_getexp_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A), \ + (__v8hf)(__m128h)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, R)) + +#define _mm_mask_getexp_round_sh(W, U, A, B, C) \ + (__m128h)__builtin_ia32_getexpsh_mask_round(A, B, W, U, C) + +#define _mm_maskz_getexp_round_sh(U, A, B, C) \ + (__m128h)__builtin_ia32_getexpsh_mask_round(A, B, \ + (__v8hf)_mm_setzero_ph(), \ + U, C) + +#define _mm512_getexp_round_ph(A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, R)) + +#define _mm512_mask_getexp_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(W), (__mmask32)(U), R)) + +#define _mm512_maskz_getexp_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), R)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vgetmantph, vgetmantsh. */ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_sh (__m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, (__v8hf) __W, + __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_sh (__mmask8 __U, __m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) _mm_setzero_ph(), + __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_ph (__m512h __W, __mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_ph (__mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) + _mm512_setzero_ph (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_getmant_round_sh (__m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + _mm_setzero_ph (), + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_round_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) __W, + __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_round_sh (__mmask8 __U, __m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) + _mm_setzero_ph(), + __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + _mm512_setzero_ph (), + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_ph (__m512h __W, __mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) __W, __U, + __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) + _mm512_setzero_ph (), + __U, __R); +} + +#else +#define _mm512_getmant_ph(X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ph(W, U, X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + + +#define _mm512_maskz_getmant_ph(U, X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getmant_sh(X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sh(W, U, X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_sh(U, X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | 
(C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getmant_round_ph(X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)-1, \ + (R))) + +#define _mm512_mask_getmant_round_ph(W, U, X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), \ + (R))) + + +#define _mm512_maskz_getmant_round_ph(U, X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)(U), \ + (R))) + +#define _mm_getmant_round_sh(X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (R))) + +#define _mm_mask_getmant_round_sh(W, U, X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U), \ + (R))) + +#define _mm_maskz_getmant_round_sh(U, X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph(), \ + (__mmask8)(U), \ + (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vmovw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi16_si128 (short __A) +{ + return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si16 (__m128i __A) +{ + return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, 0); +} + +/* Intrinsics vmovsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C) +{ + return __builtin_ia32_loadsh_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B) +{ + return __builtin_ia32_loadsh_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C) +{ + __builtin_ia32_storesh_mask (__A, __C, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sh (__m128h __A, __m128h __B) +{ + __A[0] = __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmovsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A); +} + +/* Intrinsics vcvtph2dq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi32 (__m512i __A, __mmask16 __B, __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((C), (__v16si)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epi32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2udq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu32 (__m512i __A, __mmask16 __B, __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epu32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((C), (__v16si)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2dq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi32 (__m512i __A, __mmask16 __B, + __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((A), \ + (__v16si) \ + (_mm512_setzero_si512 ()), \ + (__mmask16)(-1), (B))) + +#define _mm512_mask_cvtt_roundph_epi32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((C), \ + (__v16si)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epi32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2udq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu32 (__m512i __A, __mmask16 __B, + __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((C), \ + (__v16si)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epu32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtdq2ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_ph (__m512i __A) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_ph (__m256h __A, __mmask16 __B, __m512i __C) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_ph (__mmask16 __A, __m512i __B) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B, + _mm256_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi32_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C, + __A, + __B, + __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi32_ph (__mmask16 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B, + _mm256_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi32_ph(A, B) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(A), \ + _mm256_setzero_ph (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi32_ph(A, B, C, D) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepi32_ph(A, B, C) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(B), \ + _mm256_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtudq2ph. 
*/
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A,
+                                                  _mm256_setzero_ph (),
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ph (__m256h __A, __mmask16 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C,
+                                                  __A,
+                                                  __B,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ph (__mmask16 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B,
+                                                  _mm256_setzero_ph (),
+                                                  __A,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu32_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A,
+                                                  _mm256_setzero_ph (),
+                                                  (__mmask16) -1,
+                                                  __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C,
+                                                  __A,
+                                                  __B,
+                                                  __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu32_ph (__mmask16 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B,
+                                                  _mm256_setzero_ph (),
+                                                  __A,
+                                                  __C);
+}
+
+#else
+#define _mm512_cvt_roundepu32_ph(A, B) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(A), \
+                                            _mm256_setzero_ph (), \
+                                            (__mmask16)-1, \
+                                            (B)))
+
+#define _mm512_mask_cvt_roundepu32_ph(A, B, C, D) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(C), \
+                                            (A), \
+                                            (B), \
+                                            (D)))
+
+#define _mm512_maskz_cvt_roundepu32_ph(A, B, C) \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(B), \
+                                            _mm256_setzero_ph (), \
+                                            (A), \
+                                            (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2qq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi64(A, B) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi64(A, B, C, D) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epi64(A, B, C) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2uqq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu64(A, B) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epu64(A, B, C, D) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu64(A, B, C) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2qq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi64(A, B) \ + (__builtin_ia32_vcvttph2qq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epi64(A, B, C, D) \ + __builtin_ia32_vcvttph2qq512_mask_round ((C), (A), (B), (D)) + +#define _mm512_maskz_cvtt_roundph_epi64(A, B, C) \ + (__builtin_ia32_vcvttph2qq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2uqq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu64(A, B) \ + (__builtin_ia32_vcvttph2uqq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu64(A, B, C, D) \ + __builtin_ia32_vcvttph2uqq512_mask_round ((C), (A), (B), (D)) + +#define _mm512_maskz_cvtt_roundph_epu64(A, B, C) \ + (__builtin_ia32_vcvttph2uqq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtqq2ph. 
*/ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_ph (__m512i __A) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m512i __C) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_ph (__mmask8 __A, __m512i __B) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi64_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi64_ph (__mmask8 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi64_ph(A, B) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi64_ph(A, B, C, D) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundepi64_ph(A, B, C) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(B), \ + _mm_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtuqq2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu64_ph (__m512i __A) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m512i __C) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu64_ph (__mmask8 __A, __m512i __B) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu64_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu64_ph (__mmask8 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepu64_ph(A, B) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepu64_ph(A, B, C, D) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundepu64_ph(A, B, C) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(B), \ + _mm_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2w. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi16 (__m512i __A, __mmask32 __B, __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi16(A, B) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi16(A, B, C, D) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundph_epi16(A, B, C) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2uw. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu16 (__m512i __A, __mmask32 __B, __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, (B))) + +#define _mm512_mask_cvt_roundph_epu16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((C), (__v32hi)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2w. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi16 (__m512i __A, __mmask32 __B, + __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epi16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epi16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2uw. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu16 (__m512i __A, __mmask32 __B, + __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epu16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtw2ph. 
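   A hedged sketch (editorial addition, not part of the patch), assuming
   <immintrin.h> and -mavx512fp16: converting signed 16-bit lanes to
   _Float16.

     __m512i w = _mm512_set1_epi16 (-3);
     __m512h h = _mm512_cvtepi16_ph (w);
     __m512h z = _mm512_maskz_cvtepi16_ph (0x0000ffff, w);  // upper lanes zeroed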
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_ph (__m512i __A) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_ph (__m512h __A, __mmask32 __B, __m512i __C) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_ph (__mmask32 __A, __m512i __B) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi16_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi16_ph (__mmask32 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi16_ph(A, B) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi16_ph(A, B, C, D) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepi16_ph(A, B, C) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(B), \ + _mm512_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtuw2ph. 
*/ + extern __inline __m512h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu16_ph (__m512i __A) + { + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); + } + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu16_ph (__m512h __A, __mmask32 __B, __m512i __C) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu16_ph (__mmask32 __A, __m512i __B) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu16_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu16_ph (__mmask32 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepu16_ph(A, B) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepu16_ph(A, B, C, D) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepu16_ph(A, B, C) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(B), \ + _mm512_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtsh2si, vcvtsh2us. 
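   Editorial usage sketch, not in the patch: reading the low _Float16
   element of a __m128h back as a scalar integer.

     __m128h  s = _mm_cvti32_sh (_mm_setzero_ph (), 41);   // low lane = 41.0
     int      i = _mm_cvtsh_i32 (s);                       // 41
     unsigned u = _mm_cvtsh_u32 (s);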
*/ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_i32 (__m128h __A) +{ + return (int) __builtin_ia32_vcvtsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_u32 (__m128h __A) +{ + return (int) __builtin_ia32_vcvtsh2usi32_round (__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_i32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsh2si32_round (__A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_u32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsh2usi32_round (__A, __R); +} + +#else +#define _mm_cvt_roundsh_i32(A, B) \ + ((int)__builtin_ia32_vcvtsh2si32_round ((A), (B))) +#define _mm_cvt_roundsh_u32(A, B) \ + ((int)__builtin_ia32_vcvtsh2usi32_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_i64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvtsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_u64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvtsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_i64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsh2si64_round (__A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_u64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsh2usi64_round (__A, __R); +} + +#else +#define _mm_cvt_roundsh_i64(A, B) \ + ((long long)__builtin_ia32_vcvtsh2si64_round ((A), (B))) +#define _mm_cvt_roundsh_u64(A, B) \ + ((long long)__builtin_ia32_vcvtsh2usi64_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvttsh2si, vcvttsh2us. 
*/ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_i32 (__m128h __A) +{ + return (int) + __builtin_ia32_vcvttsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_u32 (__m128h __A) +{ + return (int) + __builtin_ia32_vcvttsh2usi32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_i32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsh2si32_round (__A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_u32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsh2usi32_round (__A, __R); +} + +#else +#define _mm_cvtt_roundsh_i32(A, B) \ + ((int)__builtin_ia32_vcvttsh2si32_round ((A), (B))) +#define _mm_cvtt_roundsh_u32(A, B) \ + ((int)__builtin_ia32_vcvttsh2usi32_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_i64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvttsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_u64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvttsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_i64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsh2si64_round (__A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_u64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsh2usi64_round (__A, __R); +} + +#else +#define _mm_cvtt_roundsh_i64(A, B) \ + ((long long)__builtin_ia32_vcvttsh2si64_round ((A), (B))) +#define _mm_cvtt_roundsh_u64(A, B) \ + ((long long)__builtin_ia32_vcvttsh2usi64_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvtsi2sh, vcvtusi2sh. 
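   Editorial note, not part of the patch: these convert a scalar integer
   into the low lane and copy the remaining lanes from the first operand.

     __m128h a = _mm_setzero_ph ();
     __m128h b = _mm_cvti32_sh (a, -7);        // low lane = -7.0, rest 0
     __m128h c = _mm_cvtu64_sh (a, 123ULL);    // only under __x86_64__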
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_sh (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtsi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_sh (__m128h __A, unsigned int __B) +{ + return __builtin_ia32_vcvtusi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi32_sh (__m128h __A, int __B, const int __R) +{ + return __builtin_ia32_vcvtsi2sh32_round (__A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu32_sh (__m128h __A, unsigned int __B, const int __R) +{ + return __builtin_ia32_vcvtusi2sh32_round (__A, __B, __R); +} + +#else +#define _mm_cvt_roundi32_sh(A, B, C) \ + (__builtin_ia32_vcvtsi2sh32_round ((A), (B), (C))) +#define _mm_cvt_roundu32_sh(A, B, C) \ + (__builtin_ia32_vcvtusi2sh32_round ((A), (B), (C))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_sh (__m128h __A, long long __B) +{ + return __builtin_ia32_vcvtsi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_sh (__m128h __A, unsigned long long __B) +{ + return __builtin_ia32_vcvtusi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_sh (__m128h __A, long long __B, const int __R) +{ + return __builtin_ia32_vcvtsi2sh64_round (__A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_sh (__m128h __A, unsigned long long __B, const int __R) +{ + return __builtin_ia32_vcvtusi2sh64_round (__A, __B, __R); +} + +#else +#define _mm_cvt_roundi64_sh(A, B, C) \ + (__builtin_ia32_vcvtsi2sh64_round ((A), (B), (C))) +#define _mm_cvt_roundu64_sh(A, B, C) \ + (__builtin_ia32_vcvtusi2sh64_round ((A), (B), (C))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvtph2pd. 
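   Usage sketch (editorial addition, not in the patch): widening the low
   eight _Float16 values of a __m128h to double precision.

     __m128h h  = _mm_cvtu32_sh (_mm_setzero_ph (), 3);    // low lane 3.0
     __m512d d  = _mm512_cvtph_pd (h);
     __m512d dz = _mm512_maskz_cvtph_pd (0x0f, h);         // low 4 lanes only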
*/ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__A, + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_pd (__m512d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__B, + _mm512_setzero_pd (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_pd (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__A, + _mm512_setzero_pd (), + (__mmask8) -1, + __B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_pd (__m512d __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_pd (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__B, + _mm512_setzero_pd (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_pd(A, B) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((A), \ + _mm512_setzero_pd (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_pd(A, B, C, D) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_pd(A, B, C) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((B), \ + _mm512_setzero_pd (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2psx. 
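   Editorial sketch, not from the patch: vcvtph2psx widens 16 _Float16
   values to single precision; the masked forms follow the usual
   write-mask pattern.

     __m256h h = _mm256_setzero_ph ();
     __m512  f = _mm512_cvtxph_ps (h);
     __m512  g = _mm512_mask_cvtxph_ps (f, 0x00ff, h);  // high 8 lanes kept from f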
*/ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtxph_ps (__m256h __A) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__A, + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtxph_ps (__m512 __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtxph_ps (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__B, + _mm512_setzero_ps (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtx_roundph_ps (__m256h __A, int __B) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__A, + _mm512_setzero_ps (), + (__mmask16) -1, + __B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtx_roundph_ps (__m512 __A, __mmask16 __B, __m256h __C, int __D) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtx_roundph_ps (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__B, + _mm512_setzero_ps (), + __A, + __C); +} + +#else +#define _mm512_cvtx_roundph_ps(A, B) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((A), \ + _mm512_setzero_ps (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvtx_roundph_ps(A, B, C, D) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvtx_roundph_ps(A, B, C) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((B), \ + _mm512_setzero_ps (), \ + (A), \ + (C))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtps2ph. 
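   A hedged usage sketch (editorial, not part of the patch): narrowing 16
   floats to _Float16, optionally zeroing lanes under a write mask.

     __m512  f  = _mm512_set1_ps (0.5f);
     __m256h h  = _mm512_cvtxps_ph (f);
     __m256h hz = _mm512_maskz_cvtxps_ph (0x000f, f);    // low 4 lanes only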
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtxps_ph (__m512 __A) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtxps_ph (__m256h __A, __mmask16 __B, __m512 __C) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C, + __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtxps_ph (__mmask16 __A, __m512 __B) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B, + _mm256_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtx_roundps_ph (__m512 __A, int __B) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtx_roundps_ph (__m256h __A, __mmask16 __B, __m512 __C, int __D) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C, + __A, __B, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtx_roundps_ph (__mmask16 __A, __m512 __B, int __C) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B, + _mm256_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_cvtx_roundps_ph(A, B) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(A), \ + _mm256_setzero_ph (),\ + (__mmask16)-1, (B))) + +#define _mm512_mask_cvtx_roundps_ph(A, B, C, D) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(C), \ + (A), (B), (D))) + +#define _mm512_maskz_cvtx_roundps_ph(A, B, C) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(B), \ + _mm256_setzero_ph (),\ + (A), (C))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtpd2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_ph (__m512d __A) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m512d __C) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C, + __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_ph (__mmask8 __A, __m512d __B) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_ph (__m512d __A, int __B) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_ph (__m128h __A, __mmask8 __B, __m512d __C, int __D) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C, + __A, __B, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_ph (__mmask8 __A, __m512d __B, int __C) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B, + _mm_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_cvt_roundpd_ph(A, B) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (B))) + +#define _mm512_mask_cvt_roundpd_ph(A, B, C, D) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(C), \ + (A), (B), (D))) + +#define _mm512_maskz_cvt_roundpd_ph(A, B, C) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(B), \ + _mm_setzero_ph (), \ + (A), (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtsh2ss, vcvtsh2sd. 
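   Editorial example, not from the patch: the scalar converts only touch
   the low element and take the upper lanes from their first operand.

     __m128h h = _mm_cvti32_sh (_mm_setzero_ph (), 9);
     __m128  s = _mm_cvtsh_ss (_mm_setzero_ps (), h);    // low lane 9.0f
     __m128d d = _mm_cvtsh_sd (_mm_setzero_pd (), h);    // low lane 9.0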
*/ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_ss (__m128 __A, __m128h __B) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A, + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsh_ss (__m128 __A, __mmask8 __B, __m128 __C, + __m128h __D) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsh_ss (__mmask8 __A, __m128 __B, + __m128h __C) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B, + _mm_setzero_ps (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_sd (__m128d __A, __m128h __B) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A, + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsh_sd (__m128d __A, __mmask8 __B, __m128d __C, + __m128h __D) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsh_sd (__mmask8 __A, __m128d __B, __m128h __C) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B, + _mm_setzero_pd (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_ss (__m128 __A, __m128h __B, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A, + _mm_setzero_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsh_ss (__m128 __A, __mmask8 __B, __m128 __C, + __m128h __D, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsh_ss (__mmask8 __A, __m128 __B, + __m128h __C, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B, + _mm_setzero_ps (), + __A, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_sd (__m128d __A, __m128h __B, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A, + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsh_sd (__m128d __A, __mmask8 __B, __m128d __C, + __m128h __D, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsh_sd (__mmask8 __A, __m128d __B, __m128h __C, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B, + _mm_setzero_pd (), + __A, __R); +} + +#else +#define _mm_cvt_roundsh_ss(A, B, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((B), (A), \ + _mm_setzero_ps (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_cvt_roundsh_ss(A, B, C, D, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsh_ss(A, 
B, C, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((C), (B), \ + _mm_setzero_ps (), \ + (A), (R))) + +#define _mm_cvt_roundsh_sd(A, B, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((B), (A), \ + _mm_setzero_pd (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_cvt_roundsh_sd(A, B, C, D, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsh_sd(A, B, C, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((C), (B), \ + _mm_setzero_pd (), \ + (A), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtss2sh, vcvtsd2sh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_sh (__m128h __A, __m128 __B) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtss_sh (__mmask8 __A, __m128h __B, __m128 __C) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_sh (__m128h __A, __m128d __B) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsd_sh (__mmask8 __A, __m128h __B, __m128d __C) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_sh (__m128h __A, __m128 __B, const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D, + const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundss_sh (__mmask8 __A, __m128h __B, __m128 __C, + const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_sh (__m128h __A, __m128d __B, const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D, + const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, __R); +} + +extern 
__inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsd_sh (__mmask8 __A, __m128h __B, __m128d __C, + const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __R); +} + +#else +#define _mm_cvt_roundss_sh(A, B, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8) -1, R)) + +#define _mm_mask_cvt_roundss_sh(A, B, C, D, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundss_sh(A, B, C, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + A, R)) + +#define _mm_cvt_roundsd_sh(A, B, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8) -1, R)) + +#define _mm_mask_cvt_roundsd_sh(A, B, C, D, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsd_sh(A, B, C, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + (A), (R))) + +#endif /* __OPTIMIZE__ */ + #ifdef __DISABLE_AVX512FP16__ #undef __DISABLE_AVX512FP16__ #pragma GCC pop_options diff --git a/gcc/config/i386/avx512fp16vlintrin.h b/gcc/config/i386/avx512fp16vlintrin.h index 1787ed5..59906d2 100644 --- a/gcc/config/i386/avx512fp16vlintrin.h +++ b/gcc/config/i386/avx512fp16vlintrin.h @@ -34,6 +34,123 @@ #define __DISABLE_AVX512FP16VL__ #endif /* __AVX512FP16VL__ */ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_ps (__m128h __a) +{ + return (__m128) __a; +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_ps (__m256h __a) +{ + return (__m256) __a; +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_pd (__m128h __a) +{ + return (__m128d) __a; +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_pd (__m256h __a) +{ + return (__m256d) __a; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_si128 (__m128h __a) +{ + return (__m128i) __a; +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_si256 (__m256h __a) +{ + return (__m256i) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_ph (__m128 __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_ph (__m256 __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_ph (__m128d __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_ph (__m256d __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_ph (__m128i __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_ph (__m256i __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph256_ph128 (__m256h __A) +{ + union + { + __m128h a[2]; + __m256h v; + } u = { .v = __A }; + return 
u.a[0]; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph128_ph256 (__m128h __A) +{ + union + { + __m128h a[2]; + __m256h v; + } u; + u.a[0] = __A; + return u.v; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextph128_ph256 (__m128h __A) +{ + return (__m256h) _mm256_insertf128_ps (_mm256_setzero_ps (), + (__m128) __A, 0); +} + /* Intrinsics v[add,sub,mul,div]ph. */ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -53,30 +170,30 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vaddph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_addph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vaddph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_addph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vaddph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_addph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vaddph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_addph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } extern __inline __m128h @@ -97,30 +214,30 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vsubph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_subph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vsubph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_subph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vsubph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_subph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vsubph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_subph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } extern __inline __m128h @@ -141,30 +258,30 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vmulph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_mulph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_mul_ph (__m256h __A, __mmask16 __B, 
__m256h __C, __m256h __D) { - return __builtin_ia32_vmulph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_mulph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vmulph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_mulph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vmulph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_mulph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } extern __inline __m128h @@ -185,30 +302,30 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vdivph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_divph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vdivph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_divph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vdivph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_divph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vdivph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_divph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } /* Intrinsics v[max,min]ph. 
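   Editorial sketch, not part of the patch; the 128/256-bit forms in this
   header additionally require -mavx512vl.

     __m128h a  = _mm_cvti32_sh (_mm_setzero_ph (), 4);
     __m128h mx = _mm_max_ph (a, _mm_setzero_ph ());
     __m128h mn = _mm_maskz_min_ph (0x0f, a, _mm_setzero_ph ());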
*/ @@ -216,96 +333,96 @@ extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_ph (__m128h __A, __m128h __B) { - return __builtin_ia32_vmaxph_v8hf_mask (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1); + return __builtin_ia32_maxph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_max_ph (__m256h __A, __m256h __B) { - return __builtin_ia32_vmaxph_v16hf_mask (__A, __B, - _mm256_setzero_ph (), - (__mmask16) -1); + return __builtin_ia32_maxph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vmaxph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_maxph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_max_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vmaxph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_maxph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vmaxph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_maxph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_max_ph (__mmask16 __A, __m256h __B, __m256h __C) { - return __builtin_ia32_vmaxph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_maxph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_ph (__m128h __A, __m128h __B) { - return __builtin_ia32_vminph_v8hf_mask (__A, __B, - _mm_setzero_ph (), - (__mmask8) -1); + return __builtin_ia32_minph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_min_ph (__m256h __A, __m256h __B) { - return __builtin_ia32_vminph_v16hf_mask (__A, __B, - _mm256_setzero_ph (), - (__mmask16) -1); + return __builtin_ia32_minph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) { - return __builtin_ia32_vminph_v8hf_mask (__C, __D, __A, __B); + return __builtin_ia32_minph128_mask (__C, __D, __A, __B); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_min_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) { - return __builtin_ia32_vminph_v16hf_mask (__C, __D, __A, __B); + return __builtin_ia32_minph256_mask (__C, __D, __A, __B); } extern __inline __m128h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_ph (__mmask8 __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vminph_v8hf_mask (__B, __C, _mm_setzero_ph (), - __A); + return __builtin_ia32_minph128_mask (__B, __C, _mm_setzero_ph (), + __A); } extern __inline __m256h __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h 
__C) { - return __builtin_ia32_vminph_v16hf_mask (__B, __C, - _mm256_setzero_ph (), __A); + return __builtin_ia32_minph256_mask (__B, __C, + _mm256_setzero_ph (), __A); } /* vcmpph */ @@ -314,8 +431,8 @@ extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmp_ph_mask (__m128h __A, __m128h __B, const int __C) { - return (__mmask8) __builtin_ia32_vcmpph_v8hf_mask (__A, __B, __C, - (__mmask8) -1); + return (__mmask8) __builtin_ia32_cmpph128_mask (__A, __B, __C, + (__mmask8) -1); } extern __inline __mmask8 @@ -323,15 +440,15 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_cmp_ph_mask (__mmask8 __A, __m128h __B, __m128h __C, const int __D) { - return (__mmask8) __builtin_ia32_vcmpph_v8hf_mask (__B, __C, __D, __A); + return (__mmask8) __builtin_ia32_cmpph128_mask (__B, __C, __D, __A); } extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmp_ph_mask (__m256h __A, __m256h __B, const int __C) { - return (__mmask16) __builtin_ia32_vcmpph_v16hf_mask (__A, __B, __C, - (__mmask16) -1); + return (__mmask16) __builtin_ia32_cmpph256_mask (__A, __B, __C, + (__mmask16) -1); } extern __inline __mmask16 @@ -339,25 +456,1819 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_cmp_ph_mask (__mmask16 __A, __m256h __B, __m256h __C, const int __D) { - return (__mmask16) __builtin_ia32_vcmpph_v16hf_mask (__B, __C, __D, - __A); + return (__mmask16) __builtin_ia32_cmpph256_mask (__B, __C, __D, + __A); } #else -#define _mm_cmp_ph_mask(A, B, C) \ - (__builtin_ia32_vcmpph_v8hf_mask ((A), (B), (C), (-1))) +#define _mm_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_cmpph128_mask ((A), (B), (C), (-1))) -#define _mm_mask_cmp_ph_mask(A, B, C, D) \ - (__builtin_ia32_vcmpph_v8hf_mask ((B), (C), (D), (A))) +#define _mm_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_cmpph128_mask ((B), (C), (D), (A))) -#define _mm256_cmp_ph_mask(A, B, C) \ - (__builtin_ia32_vcmpph_v16hf_mask ((A), (B), (C), (-1))) +#define _mm256_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_cmpph256_mask ((A), (B), (C), (-1))) -#define _mm256_mask_cmp_ph_mask(A, B, C, D) \ - (__builtin_ia32_vcmpph_v16hf_mask ((B), (C), (D), (A))) +#define _mm256_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_cmpph256_mask ((B), (C), (D), (A))) #endif /* __OPTIMIZE__ */ +/* Intrinsics vsqrtph. 
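   Usage sketch (editorial addition, not in the patch; needs -mavx512fp16
   and -mavx512vl):

     __m128h x  = _mm_cvti32_sh (_mm_setzero_ph (), 16);
     __m128h r  = _mm_sqrt_ph (x);                            // low lane 4.0
     __m256h rz = _mm256_maskz_sqrt_ph (0x00ff, _mm256_setzero_ph ());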
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ph (__m128h __A) +{ + return __builtin_ia32_sqrtph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_ph (__m256h __A) +{ + return __builtin_ia32_sqrtph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_sqrtph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sqrt_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_sqrtph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_sqrtph128_mask (__B, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sqrt_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_sqrtph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vrsqrtph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ph (__m128h __A) +{ + return __builtin_ia32_rsqrtph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt_ph (__m256h __A) +{ + return __builtin_ia32_rsqrtph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_rsqrtph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rsqrt_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_rsqrtph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_rsqrtph128_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rsqrt_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_rsqrtph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vrcpph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ph (__m128h __A) +{ + return __builtin_ia32_rcpph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp_ph (__m256h __A) +{ + return __builtin_ia32_rcpph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_rcpph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_rcpph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_rcpph128_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rcp_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_rcpph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vscalefph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ph (__m128h __A, __m128h __B) +{ + return __builtin_ia32_scalefph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_scalef_ph (__m256h __A, __m256h __B) +{ + return __builtin_ia32_scalefph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_scalefph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_scalef_ph (__m256h __A, __mmask16 __B, __m256h __C, + __m256h __D) +{ + return __builtin_ia32_scalefph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_scalefph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_scalef_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_scalefph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vreduceph. 
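   Editorial note, not part of the patch: vreduce returns the fractional
   remainder left after rounding each lane to a fixed number of fraction
   bits; the imm8 layout is assumed here to match vreduceps (upper four
   bits = fraction-bit count, low bits = rounding control).

     __m128h x = _mm_cvtu32_sh (_mm_setzero_ph (), 5);
     __m128h r = _mm_reduce_ph (x, 0x00);   // x minus x rounded to an integer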
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_ph (__m128h __A, int __B) +{ + return __builtin_ia32_reduceph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_reduceph128_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_ph (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_reduceph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_ph (__m256h __A, int __B) +{ + return __builtin_ia32_reduceph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_ph (__m256h __A, __mmask16 __B, __m256h __C, int __D) +{ + return __builtin_ia32_reduceph256_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_ph (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_reduceph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +#else +#define _mm_reduce_ph(A, B) \ + (__builtin_ia32_reduceph128_mask ((A), (B), \ + _mm_setzero_ph (), \ + ((__mmask8)-1))) + +#define _mm_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph128_mask ((C), (D), (A), (B))) + +#define _mm_maskz_reduce_ph(A, B, C) \ + (__builtin_ia32_reduceph128_mask ((B), (C), _mm_setzero_ph (), (A))) + +#define _mm256_reduce_ph(A, B) \ + (__builtin_ia32_reduceph256_mask ((A), (B), \ + _mm256_setzero_ph (), \ + ((__mmask16)-1))) + +#define _mm256_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph256_mask ((C), (D), (A), (B))) + +#define _mm256_maskz_reduce_ph(A, B, C) \ + (__builtin_ia32_reduceph256_mask ((B), (C), _mm256_setzero_ph (), (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscaleph. 
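   A usage sketch (editorial, not from the patch), assuming the imm8
   follows the usual vrndscale encoding in which the low bits select the
   rounding mode:

     __m128h x = _mm_cvtu32_sh (_mm_setzero_ph (), 3);
     __m128h n = _mm_roundscale_ph (x, _MM_FROUND_TO_NEAREST_INT);
     __m128h f = _mm_roundscale_ph (x, _MM_FROUND_TO_NEG_INF);   // floor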
*/ +#ifdef __OPTIMIZE__ + extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roundscale_ph (__m128h __A, int __B) + { + return __builtin_ia32_rndscaleph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); + } + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_rndscaleph128_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_ph (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_rndscaleph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_ph (__m256h __A, int __B) +{ + return __builtin_ia32_rndscaleph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_ph (__m256h __A, __mmask16 __B, __m256h __C, + int __D) +{ + return __builtin_ia32_rndscaleph256_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_ph (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_rndscaleph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +#else +#define _mm_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph128_mask ((A), (B), _mm_setzero_ph (), \ + ((__mmask8)-1))) + +#define _mm_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph128_mask ((C), (D), (A), (B))) + +#define _mm_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph128_mask ((B), (C), _mm_setzero_ph (), (A))) + +#define _mm256_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph256_mask ((A), (B), \ + _mm256_setzero_ph(), \ + ((__mmask16)-1))) + +#define _mm256_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph256_mask ((C), (D), (A), (B))) + +#define _mm256_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph256_mask ((B), (C), \ + _mm256_setzero_ph (), (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfpclassph. 
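   Editorial sketch, not part of the patch: vfpclass tests every lane
   against the categories selected by the immediate; the category bits are
   assumed here to match the other vfpclass instructions (0x01 QNaN,
   0x08 +Inf, 0x10 -Inf, 0x80 SNaN, ...).

     __m128h  x   = _mm_setzero_ph ();
     __mmask8 nan = _mm_fpclass_ph_mask (x, 0x81);   // QNaN or SNaN lanes
     __mmask8 inf = _mm_fpclass_ph_mask (x, 0x18);   // +Inf or -Inf lanes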
*/ +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fpclass_ph_mask (__mmask8 __U, __m128h __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A, + __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_ph_mask (__m128h __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A, + __imm, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_ph_mask (__mmask16 __U, __m256h __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A, + __imm, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_ph_mask (__m256h __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A, + __imm, + (__mmask16) -1); +} + +#else +#define _mm_fpclass_ph_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \ + (int) (C),(__mmask8)-1)) + +#define _mm_mask_fpclass_ph_mask(u, X, C) \ + ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \ + (int) (C),(__mmask8)(u))) + +#define _mm256_fpclass_ph_mask(X, C) \ + ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \ + (int) (C),(__mmask16)-1)) + +#define _mm256_mask_fpclass_ph_mask(u, X, C) \ + ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \ + (int) (C),(__mmask16)(u))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vgetexpph, vgetexpsh. */ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getexp_ph (__m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getexp_ph (__m256h __W, __mmask16 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) __W, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getexp_ph (__mmask16 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ph (__m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_ph (__m128h __W, __mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_ph (__mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U); +} + + +/* Intrinsics vgetmantph, vgetmantsh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_ph (__m256h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_ph (__m256h __W, __mmask16 __U, __m256h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) __W, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_ph (__mmask16 __U, __m256h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ph (__m128h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ph (__m128h __W, __mmask8 __U, __m128h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ph (__mmask8 __U, __m128h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U); +} + +#else +#define _mm256_getmant_ph(X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)_mm256_setzero_ph (), \ + (__mmask16)-1)) + +#define _mm256_mask_getmant_ph(W, U, X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_getmant_ph(U, X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)_mm256_setzero_ph (), \ + (__mmask16)(U))) + +#define _mm_getmant_ph(X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)_mm_setzero_ph (), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_ph(W, U, X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ph(U, X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)_mm_setzero_ph (), \ + (__mmask8)(U))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2dq. 
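The VGETMANTPH wrappers above take the two immediate enums (_MM_MANTISSA_NORM_ENUM, _MM_MANTISSA_SIGN_ENUM) already used by the float and double getmant intrinsics and fold them into the builtin's immediate as (C << 2) | B. A sketch of the usual pairing with getexp, assuming the _MM_MANT_* constants from the wider AVX-512 headers:

  #include <immintrin.h>
  #include <stdio.h>

  /* Split 12.0 into a mantissa in [1, 2) and an unbiased exponent.
     Compile with: gcc -O2 -mavx512fp16 -mavx512vl getmant.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) 12.0f);

    /* Normalise the mantissa into [1.0, 2.0) and keep the source sign.  */
    __m128h m = _mm_getmant_ph (x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
    __m128h e = _mm_getexp_ph (x);

    /* Expect mantissa 1.5 and exponent 3.0, since 12 = 1.5 * 2^3.  */
    printf ("mant %f  exp %f\n",
            (double) _mm_cvtsh_h (m), (double) _mm_cvtsh_h (e));
    return 0;
  }
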
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__C, ( __v4si) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__B, + (__v4si) _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__C, ( __v8si) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2udq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__C, ( __v4si) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__B, + (__v4si) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__C, ( __v8si) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__B, + (__v8si) _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2dq. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2dq128_mask (__A, + (__v4si) _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i)__builtin_ia32_vcvttph2dq128_mask (__C, + ( __v4si) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2dq128_mask (__B, + (__v4si) _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__C, + ( __v8si) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2udq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__C, + ( __v4si) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__B, + (__v4si) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__C, + ( __v8si) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtdq2ph. 
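Taken together, the vcvtph2dq/vcvtph2udq wrappers and the vcvttph2dq/vcvttph2udq wrappers above differ only in rounding: the plain forms honour the current MXCSR rounding mode, the "tt" forms truncate toward zero. Only the low four half lanes of the __m128h source feed the 128-bit destination. A sketch under the same assumptions as the earlier examples:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl cvt32.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) 2.75f);

    __m128i r = _mm_cvtph_epi32 (x);   /* 3 3 3 3 (round-to-nearest default) */
    __m128i t = _mm_cvttph_epi32 (x);  /* 2 2 2 2 (truncate toward zero) */

    /* Merge-masking: lanes with a clear mask bit keep the value from src.  */
    __m128i src = _mm_set1_epi32 (-1);
    __m128i m = _mm_mask_cvtph_epi32 (src, (__mmask8) 0x1, x); /* 3 -1 -1 -1 */

    printf ("%d %d %d\n", _mm_cvtsi128_si32 (r), _mm_cvtsi128_si32 (t),
            _mm_cvtsi128_si32 (m));
    return 0;
  }
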
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_ph (__m128i __A) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_ph (__m256i __A) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtudq2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_ph (__m128i __A) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __C, + __A, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu32_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu32_ph (__m256i __A) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu32_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2qq. 
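For the opposite direction, the vcvtdq2ph/vcvtudq2ph wrappers above convert signed or unsigned 32-bit lanes to half precision; a 128-bit source fills only the low four half lanes of the __m128h result (the rest are zeroed), while a 256-bit source fills all eight. A sketch:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl dq2ph.c  */
  int
  main (void)
  {
    __m128i i4 = _mm_set_epi32 (40, 30, 20, 10);
    __m128h lo = _mm_cvtepi32_ph (i4);        /* 10 20 30 40 0 0 0 0 */

    __m256i i8 = _mm256_set1_epi32 (7);
    __m128h all = _mm256_cvtepi32_ph (i8);    /* eight lanes of 7.0 */

    /* Unsigned-source variant.  */
    __m128i u4 = _mm_set1_epi32 (1000);
    __m128h uu = _mm_cvtepu32_ph (u4);

    printf ("%f %f %f\n", (double) _mm_cvtsh_h (lo),
            (double) _mm_cvtsh_h (all), (double) _mm_cvtsh_h (uu));
    return 0;
  }
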
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi64 (__m128h __A) +{ + return + __builtin_ia32_vcvtph2qq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq128_mask (__C, __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2qq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq256_mask (__C, __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2uqq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__C, __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__C, __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2qq. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq128_mask (__C, + __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq256_mask (__C, + __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2uqq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__C, + __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__C, + __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtqq2ph. 
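The vcvtph2qq/vcvtph2uqq and vcvttph2qq/vcvttph2uqq wrappers above mirror the 32-bit versions for 64-bit destinations, again in rounding and truncating forms; only the low two (128-bit destination) or four (256-bit destination) half lanes of the source are consumed. A sketch, additionally assuming an x86-64 target for _mm_cvtsi128_si64:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl cvt64.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) 5.9f);

    __m128i q  = _mm_cvtph_epi64 (x);    /* rounded: 6, 6 */
    __m128i tq = _mm_cvttph_epi64 (x);   /* truncated: 5, 5 */
    __m256i q4 = _mm256_cvtph_epi64 (x); /* four 64-bit lanes */

    printf ("%lld %lld %lld\n",
            (long long) _mm_cvtsi128_si64 (q),
            (long long) _mm_cvtsi128_si64 (tq),
            (long long) _mm256_extract_epi64 (q4, 0));
    return 0;
  }
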
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_ph (__m128i __A) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_ph (__m256i __A) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtuqq2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu64_ph (__m128i __A) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu64_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu64_ph (__m256i __A) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu64_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2w. 
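Going back from 64-bit integers, the vcvtqq2ph/vcvtuqq2ph wrappers above return an __m128h for both source widths: two valid half lanes from a 128-bit source, four from a 256-bit source, with the upper lanes zeroed. A sketch:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl qq2ph.c  */
  int
  main (void)
  {
    __m128i q2 = _mm_set_epi64x (200, 100);
    __m256i q4 = _mm256_set1_epi64x (300);

    __m128h a = _mm_cvtepi64_ph (q2);     /* 100.0, 200.0, rest 0 */
    __m128h b = _mm256_cvtepi64_ph (q4);  /* 300.0 x4, upper lanes 0 */

    printf ("%f %f\n", (double) _mm_cvtsh_h (a), (double) _mm_cvtsh_h (b));
    return 0;
  }
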
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__C, ( __v8hi) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__C, ( __v16hi) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2uw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__C, ( __v8hi) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__C, ( __v16hi) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2w. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__C, + ( __v8hi) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__C, + ( __v16hi) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2uw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__C, + ( __v8hi) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__C, + ( __v16hi) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__B, + (__v16hi) _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtw2ph. 
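The 16-bit integer conversions above (vcvtph2w/vcvtph2uw and their truncating vcvttph2w/vcvttph2uw forms) are lane-for-lane: eight lanes in 128 bits, sixteen in 256 bits, so no destination lanes are wasted. A sketch under the same assumptions:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl cvt16.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) -1.5f);

    __m128i w  = _mm_cvtph_epi16 (x);   /* -2 per lane (round-to-nearest-even) */
    __m128i tw = _mm_cvttph_epi16 (x);  /* -1 per lane (truncation) */

    short out[8], tout[8];
    _mm_storeu_si128 ((__m128i *) out, w);
    _mm_storeu_si128 ((__m128i *) tout, tw);

    printf ("%d %d\n", out[0], tout[0]);
    return 0;
  }
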
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_ph (__m128i __A) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __C, + __A, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi16_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_ph (__m256i __A) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __A, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_ph (__m256h __A, __mmask16 __B, __m256i __C) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __C, + __A, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi16_ph (__mmask16 __A, __m256i __B) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __B, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vcvtuw2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_ph (__m128i __A) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu16_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu16_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_ph (__m256i __A) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __A, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu16_ph (__m256h __A, __mmask16 __B, __m256i __C) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu16_ph (__mmask16 __A, __m256i __B) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __B, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2pd. 
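The vcvtw2ph/vcvtuw2ph wrappers above complete the 16-bit pair, converting signed or unsigned words to half precision with the usual merge- and zero-masking variants. A sketch:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl w2ph.c  */
  int
  main (void)
  {
    __m128i w = _mm_set1_epi16 (-42);
    __m128h a = _mm_cvtepi16_ph (w);                    /* -42.0 in every lane */

    /* Zero-masking: convert only lanes 0..3, zero the rest.  */
    __m128h b = _mm_maskz_cvtepi16_ph ((__mmask8) 0x0f, w);

    /* Unsigned 16-bit source.  */
    __m128i u = _mm_set1_epi16 (1000);
    __m128h c = _mm_cvtepu16_ph (u);

    printf ("%f %f %f\n", (double) _mm_cvtsh_h (a),
            (double) _mm_cvtsh_h (b), (double) _mm_cvtsh_h (c));
    return 0;
  }
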
*/ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd128_mask (__A, + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_pd (__m128d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd128_mask (__C, __A, __B); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd128_mask (__B, _mm_setzero_pd (), __A); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd256_mask (__A, + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_pd (__m256d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd256_mask (__C, __A, __B); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd256_mask (__B, + _mm256_setzero_pd (), + __A); +} + +/* Intrinsics vcvtph2ps. */ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtxph_ps (__m128h __A) +{ + return __builtin_ia32_vcvtph2psx128_mask (__A, + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtxph_ps (__m128 __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2psx128_mask (__C, __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtxph_ps (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2psx128_mask (__B, _mm_setzero_ps (), __A); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtxph_ps (__m128h __A) +{ + return __builtin_ia32_vcvtph2psx256_mask (__A, + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtxph_ps (__m256 __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2psx256_mask (__C, __A, __B); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtxph_ps (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2psx256_mask (__B, + _mm256_setzero_ps (), + __A); +} + +/* Intrinsics vcvtxps2ph. 
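vcvtph2pd above widens the low two half lanes to doubles and vcvtph2psx the low four to singles; the extra "x" keeps these AVX512FP16 wrappers distinct from the older F16C _mm_cvtph_ps, which takes an __m128i. The narrowing counterparts (vcvtps2phx, vcvtpd2ph) are added just below and are exercised in this sketch as well:

  #include <immintrin.h>
  #include <stdio.h>

  /* Compile with: gcc -O2 -mavx512fp16 -mavx512vl ph2pd.c  */
  int
  main (void)
  {
    __m128h x = _mm_set1_ph ((_Float16) 0.5f);

    __m128d d = _mm_cvtph_pd (x);    /* { 0.5, 0.5 } */
    __m128  s = _mm_cvtxph_ps (x);   /* { 0.5f, 0.5f, 0.5f, 0.5f } */

    /* Narrowing direction, defined just below in this header.  */
    __m128h back_s = _mm_cvtxps_ph (s);
    __m128h back_d = _mm_cvtpd_ph (d);

    printf ("%f %f %f %f\n", _mm_cvtsd_f64 (d), (double) _mm_cvtss_f32 (s),
            (double) _mm_cvtsh_h (back_s), (double) _mm_cvtsh_h (back_d));
    return 0;
  }
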
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtxps_ph (__m128 __A) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m128 __C) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtxps_ph (__mmask8 __A, __m128 __B) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtxps_ph (__m256 __A) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m256 __C) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtxps_ph (__mmask8 __A, __m256 __B) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtpd2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_ph (__m128d __A) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m128d __C) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_ph (__mmask8 __A, __m128d __B) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_ph (__m256d __A) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m256d __C) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_ph (__mmask8 __A, __m256d __B) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __B, + _mm_setzero_ph (), + __A); +} + #ifdef __DISABLE_AVX512FP16VL__ #undef __DISABLE_AVX512FP16VL__ #pragma GCC pop_options diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index d11c02b..7fd4286 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -134,6 +134,7 @@ DEF_POINTER_TYPE (PCVOID, VOID, CONST) DEF_POINTER_TYPE (PVOID, VOID) DEF_POINTER_TYPE (PDOUBLE, DOUBLE) DEF_POINTER_TYPE (PFLOAT, FLOAT) +DEF_POINTER_TYPE (PCFLOAT16, FLOAT16, CONST) DEF_POINTER_TYPE (PSHORT, SHORT) DEF_POINTER_TYPE (PUSHORT, USHORT) DEF_POINTER_TYPE (PINT, INT) @@ -1304,17 +1305,72 @@ DEF_FUNCTION_TYPE (UINT8, PV2DI, PCV2DI, PCVOID) # FP16 builtins DEF_FUNCTION_TYPE (V8HF, V8HI) 
+DEF_FUNCTION_TYPE (QI, V8HF, INT, UQI) +DEF_FUNCTION_TYPE (HI, V16HF, INT, UHI) +DEF_FUNCTION_TYPE (SI, V32HF, INT, USI) +DEF_FUNCTION_TYPE (INT, V8HF, INT) +DEF_FUNCTION_TYPE (INT64, V8HF, INT) +DEF_FUNCTION_TYPE (UINT, V8HF, INT) +DEF_FUNCTION_TYPE (UINT64, V8HF, INT) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF) +DEF_FUNCTION_TYPE (VOID, PCFLOAT16, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, PCFLOAT16, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8HF, INT, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, INT64, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, UINT, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, UINT64, INT) +DEF_FUNCTION_TYPE (V2DI, V8HF, V2DI, UQI) +DEF_FUNCTION_TYPE (V4DI, V8HF, V4DI, UQI) +DEF_FUNCTION_TYPE (V2DF, V8HF, V2DF, UQI) +DEF_FUNCTION_TYPE (V4DF, V8HF, V4DF, UQI) +DEF_FUNCTION_TYPE (V4SI, V8HF, V4SI, UQI) +DEF_FUNCTION_TYPE (V4SF, V8HF, V4SF, UQI) +DEF_FUNCTION_TYPE (V8SI, V8HF, V8SI, UQI) +DEF_FUNCTION_TYPE (V8SF, V8HF, V8SF, UQI) +DEF_FUNCTION_TYPE (V8HI, V8HF, V8HI, UQI) +DEF_FUNCTION_TYPE (V8HF, V4SI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8SI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8SF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V2DI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V4DI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V4DF, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8HI, V8HF, UQI) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, UQI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, INT, V8HF, UQI) DEF_FUNCTION_TYPE (UQI, V8HF, V8HF, INT, UQI) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI) DEF_FUNCTION_TYPE (UQI, V8HF, V8HF, INT, UQI, INT) +DEF_FUNCTION_TYPE (V8DI, V8HF, V8DI, UQI, INT) +DEF_FUNCTION_TYPE (V8DF, V8HF, V8DF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V8DI, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V8DF, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V2DF, V8HF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V4SF, V8HF, V8HF, UQI, INT) +DEF_FUNCTION_TYPE (V2DF, V8HF, V2DF, V2DF, UQI, INT) +DEF_FUNCTION_TYPE (V4SF, V8HF, V4SF, V4SF, UQI, INT) +DEF_FUNCTION_TYPE (V8HF, V8HF, V8HF, INT, V8HF, UQI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF) +DEF_FUNCTION_TYPE (V16HI, V16HF, V16HI, UHI) +DEF_FUNCTION_TYPE (V16HF, V16HI, V16HF, UHI) +DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, UHI) +DEF_FUNCTION_TYPE (V16SI, V16HF, V16SI, UHI, INT) +DEF_FUNCTION_TYPE (V16SF, V16HF, V16SF, UHI, INT) +DEF_FUNCTION_TYPE (V16HF, V16HF, INT, V16HF, UHI) DEF_FUNCTION_TYPE (UHI, V16HF, V16HF, INT, UHI) +DEF_FUNCTION_TYPE (V16HF, V16SI, V16HF, UHI, INT) +DEF_FUNCTION_TYPE (V16HF, V16SF, V16HF, UHI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, V16HF, UHI) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT) +DEF_FUNCTION_TYPE (V32HI, V32HF, V32HI, USI, INT) +DEF_FUNCTION_TYPE (V32HF, V32HI, V32HF, USI, INT) DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI) +DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI) DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT) +DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index c9d80cb..dc56dc2 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -393,6 +393,10 @@ BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mas BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) +/* AVX512FP16 */ +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, "__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) V8HF_FTYPE_PCFLOAT16_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_storehf_mask, "__builtin_ia32_storesh_mask", IX86_BUILTIN_STORESH_MASK, UNKNOWN, (int) VOID_FTYPE_PCFLOAT16_V8HF_UQI) + /* RDPKRU and WRPKRU. */ BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru, "__builtin_ia32_rdpkru", IX86_BUILTIN_RDPKRU, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru, "__builtin_ia32_wrpkru", IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED) @@ -2775,33 +2779,102 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__b BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) /* AVX512FP16. */ -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_vaddph_v8hf_mask", IX86_BUILTIN_VADDPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_vaddph_v16hf_mask", IX86_BUILTIN_VADDPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_vaddph_v32hf_mask", IX86_BUILTIN_VADDPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_vsubph_v8hf_mask", IX86_BUILTIN_VSUBPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_vsubph_v16hf_mask", IX86_BUILTIN_VSUBPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_vsubph_v32hf_mask", IX86_BUILTIN_VSUBPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_vmulph_v8hf_mask", IX86_BUILTIN_VMULPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_vmulph_v16hf_mask", IX86_BUILTIN_VMULPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_vmulph_v32hf_mask", IX86_BUILTIN_VMULPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_vdivph_v8hf_mask", IX86_BUILTIN_VDIVPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_vdivph_v16hf_mask", IX86_BUILTIN_VDIVPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, 
OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_vdivph_v32hf_mask", IX86_BUILTIN_VDIVPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask, "__builtin_ia32_vaddsh_v8hf_mask", IX86_BUILTIN_VADDSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask, "__builtin_ia32_vsubsh_v8hf_mask", IX86_BUILTIN_VSUBSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask, "__builtin_ia32_vmulsh_v8hf_mask", IX86_BUILTIN_VMULSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask, "__builtin_ia32_vdivsh_v8hf_mask", IX86_BUILTIN_VDIVSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv8hf3_mask, "__builtin_ia32_vmaxph_v8hf_mask", IX86_BUILTIN_VMAXPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv16hf3_mask, "__builtin_ia32_vmaxph_v16hf_mask", IX86_BUILTIN_VMAXPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask, "__builtin_ia32_vmaxph_v32hf_mask", IX86_BUILTIN_VMAXPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv8hf3_mask, "__builtin_ia32_vminph_v8hf_mask", IX86_BUILTIN_VMINPH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv16hf3_mask, "__builtin_ia32_vminph_v16hf_mask", IX86_BUILTIN_VMINPH_V16HF_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask, "__builtin_ia32_vminph_v32hf_mask", IX86_BUILTIN_VMINPH_V32HF_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask, "__builtin_ia32_vmaxsh_v8hf_mask", IX86_BUILTIN_VMAXSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask, "__builtin_ia32_vminsh_v8hf_mask", IX86_BUILTIN_VMINSH_V8HF_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_cmpv8hf3_mask, "__builtin_ia32_vcmpph_v8hf_mask", IX86_BUILTIN_VCMPPH_V8HF_MASK, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI) -BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_cmpv16hf3_mask, "__builtin_ia32_vcmpph_v16hf_mask", IX86_BUILTIN_VCMPPH_V16HF_MASK, UNKNOWN, (int) UHI_FTYPE_V16HF_V16HF_INT_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask, "__builtin_ia32_vcmpph_v32hf_mask", IX86_BUILTIN_VCMPPH_V32HF_MASK, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_addph256_mask", IX86_BUILTIN_ADDPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_addv32hf3_mask, "__builtin_ia32_addph512_mask", IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_subph128_mask", IX86_BUILTIN_SUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, "__builtin_ia32_subph256_mask", IX86_BUILTIN_SUBPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_subph512_mask", IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_mulph128_mask", IX86_BUILTIN_MULPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_mulph256_mask", IX86_BUILTIN_MULPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask, "__builtin_ia32_mulph512_mask", IX86_BUILTIN_MULPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv8hf3_mask, "__builtin_ia32_divph128_mask", IX86_BUILTIN_DIVPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv16hf3_mask, "__builtin_ia32_divph256_mask", IX86_BUILTIN_DIVPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask, "__builtin_ia32_divph512_mask", IX86_BUILTIN_DIVPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask, "__builtin_ia32_addsh_mask", IX86_BUILTIN_ADDSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask, "__builtin_ia32_subsh_mask", IX86_BUILTIN_SUBSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask, "__builtin_ia32_mulsh_mask", IX86_BUILTIN_MULSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask, "__builtin_ia32_divsh_mask", IX86_BUILTIN_DIVSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv8hf3_mask, "__builtin_ia32_maxph128_mask", IX86_BUILTIN_MAXPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv16hf3_mask, "__builtin_ia32_maxph256_mask", IX86_BUILTIN_MAXPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask, "__builtin_ia32_maxph512_mask", IX86_BUILTIN_MAXPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv8hf3_mask, "__builtin_ia32_minph128_mask", IX86_BUILTIN_MINPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv16hf3_mask, "__builtin_ia32_minph256_mask", IX86_BUILTIN_MINPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (0, 
OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask, "__builtin_ia32_minph512_mask", IX86_BUILTIN_MINPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask, "__builtin_ia32_maxsh_mask", IX86_BUILTIN_MAXSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask, "__builtin_ia32_minsh_mask", IX86_BUILTIN_MINSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_cmpv8hf3_mask, "__builtin_ia32_cmpph128_mask", IX86_BUILTIN_CMPPH128_MASK, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_cmpv16hf3_mask, "__builtin_ia32_cmpph256_mask", IX86_BUILTIN_CMPPH256_MASK, UNKNOWN, (int) UHI_FTYPE_V16HF_V16HF_INT_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask, "__builtin_ia32_cmpph512_mask", IX86_BUILTIN_CMPPH512_MASK, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv8hf2_mask, "__builtin_ia32_sqrtph128_mask", IX86_BUILTIN_SQRTPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv16hf2_mask, "__builtin_ia32_sqrtph256_mask", IX86_BUILTIN_SQRTPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv8hf2_mask, "__builtin_ia32_rsqrtph128_mask", IX86_BUILTIN_RSQRTPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv16hf2_mask, "__builtin_ia32_rsqrtph256_mask", IX86_BUILTIN_RSQRTPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rsqrtv32hf2_mask, "__builtin_ia32_rsqrtph512_mask", IX86_BUILTIN_RSQRTPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmrsqrtv8hf2_mask, "__builtin_ia32_rsqrtsh_mask", IX86_BUILTIN_RSQRTSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv8hf2_mask, "__builtin_ia32_rcpph128_mask", IX86_BUILTIN_RCPPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv16hf2_mask, "__builtin_ia32_rcpph256_mask", IX86_BUILTIN_RCPPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rcpv32hf2_mask, "__builtin_ia32_rcpph512_mask", IX86_BUILTIN_RCPPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmrcpv8hf2_mask, "__builtin_ia32_rcpsh_mask", IX86_BUILTIN_RCPSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_scalefv8hf_mask, "__builtin_ia32_scalefph128_mask", IX86_BUILTIN_SCALEFPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_scalefv16hf_mask, "__builtin_ia32_scalefph256_mask", IX86_BUILTIN_SCALEFPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_reducepv8hf_mask, "__builtin_ia32_reduceph128_mask", IX86_BUILTIN_REDUCEPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducepv16hf_mask, "__builtin_ia32_reduceph256_mask", IX86_BUILTIN_REDUCEPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf_mask, "__builtin_ia32_rndscaleph128_mask", IX86_BUILTIN_RNDSCALEPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf_mask, "__builtin_ia32_rndscaleph256_mask", IX86_BUILTIN_RNDSCALEPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv16hf_mask, "__builtin_ia32_fpclassph256_mask", IX86_BUILTIN_FPCLASSPH256, UNKNOWN, (int) HI_FTYPE_V16HF_INT_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv8hf_mask, "__builtin_ia32_fpclassph128_mask", IX86_BUILTIN_FPCLASSPH128, UNKNOWN, (int) QI_FTYPE_V8HF_INT_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_fpclassv32hf_mask, "__builtin_ia32_fpclassph512_mask", IX86_BUILTIN_FPCLASSPH512, UNKNOWN, (int) SI_FTYPE_V32HF_INT_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512dq_vmfpclassv8hf_mask, "__builtin_ia32_fpclasssh_mask", IX86_BUILTIN_FPCLASSSH_MASK, UNKNOWN, (int) QI_FTYPE_V8HF_INT_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_getexpv16hf_mask, "__builtin_ia32_getexpph256_mask", IX86_BUILTIN_GETEXPPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_getexpv8hf_mask, "__builtin_ia32_getexpph128_mask", IX86_BUILTIN_GETEXPPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_getmantv16hf_mask, "__builtin_ia32_getmantph256_mask", IX86_BUILTIN_GETMANTPH256, UNKNOWN, (int) V16HF_FTYPE_V16HF_INT_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_getmantv8hf_mask, "__builtin_ia32_getmantph128_mask", IX86_BUILTIN_GETMANTPH128, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_V8HF_UQI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_movhf_mask, "__builtin_ia32_vmovsh_mask", IX86_BUILTIN_VMOVSH_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v4si_mask, "__builtin_ia32_vcvtph2dq128_mask", IX86_BUILTIN_VCVTPH2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v8si_mask, "__builtin_ia32_vcvtph2dq256_mask", IX86_BUILTIN_VCVTPH2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v4si_mask, "__builtin_ia32_vcvtph2udq128_mask", IX86_BUILTIN_VCVTPH2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v8si_mask, "__builtin_ia32_vcvtph2udq256_mask", IX86_BUILTIN_VCVTPH2UDQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv4si2_mask, "__builtin_ia32_vcvttph2dq128_mask", 
IX86_BUILTIN_VCVTTPH2DQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8si2_mask, "__builtin_ia32_vcvttph2dq256_mask", IX86_BUILTIN_VCVTTPH2DQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv4si2_mask, "__builtin_ia32_vcvttph2udq128_mask", IX86_BUILTIN_VCVTTPH2UDQ128_MASK, UNKNOWN, (int) V4SI_FTYPE_V8HF_V4SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8si2_mask, "__builtin_ia32_vcvttph2udq256_mask", IX86_BUILTIN_VCVTTPH2UDQ256_MASK, UNKNOWN, (int) V8SI_FTYPE_V8HF_V8SI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v2di_mask, "__builtin_ia32_vcvtph2qq128_mask", IX86_BUILTIN_VCVTPH2QQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v4di_mask, "__builtin_ia32_vcvtph2qq256_mask", IX86_BUILTIN_VCVTPH2QQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v2di_mask, "__builtin_ia32_vcvtph2uqq128_mask", IX86_BUILTIN_VCVTPH2UQQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v4di_mask, "__builtin_ia32_vcvtph2uqq256_mask", IX86_BUILTIN_VCVTPH2UQQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv2di2_mask, "__builtin_ia32_vcvttph2qq128_mask", IX86_BUILTIN_VCVTTPH2QQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv4di2_mask, "__builtin_ia32_vcvttph2qq256_mask", IX86_BUILTIN_VCVTTPH2QQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv2di2_mask, "__builtin_ia32_vcvttph2uqq128_mask", IX86_BUILTIN_VCVTTPH2UQQ128_MASK, UNKNOWN, (int) V2DI_FTYPE_V8HF_V2DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv4di2_mask, "__builtin_ia32_vcvttph2uqq256_mask", IX86_BUILTIN_VCVTTPH2UQQ256_MASK, UNKNOWN, (int) V4DI_FTYPE_V8HF_V4DI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v8hi_mask, "__builtin_ia32_vcvtph2w128_mask", IX86_BUILTIN_VCVTPH2W128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v16hi_mask, "__builtin_ia32_vcvtph2w256_mask", IX86_BUILTIN_VCVTPH2W256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v8hi_mask, "__builtin_ia32_vcvtph2uw128_mask", IX86_BUILTIN_VCVTPH2UW128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v16hi_mask, "__builtin_ia32_vcvtph2uw256_mask", IX86_BUILTIN_VCVTPH2UW256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8hi2_mask, "__builtin_ia32_vcvttph2w128_mask", IX86_BUILTIN_VCVTTPH2W128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI) +BDESC 
(OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv16hi2_mask, "__builtin_ia32_vcvttph2w256_mask", IX86_BUILTIN_VCVTTPH2W256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8hi2_mask, "__builtin_ia32_vcvttph2uw128_mask", IX86_BUILTIN_VCVTTPH2UW128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HF_V8HI_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv16hi2_mask, "__builtin_ia32_vcvttph2uw256_mask", IX86_BUILTIN_VCVTTPH2UW256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HF_V16HI_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v8hi_mask, "__builtin_ia32_vcvtw2ph128_mask", IX86_BUILTIN_VCVTW2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v16hi_mask, "__builtin_ia32_vcvtw2ph256_mask", IX86_BUILTIN_VCVTW2PH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HI_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v8hi_mask, "__builtin_ia32_vcvtuw2ph128_mask", IX86_BUILTIN_VCVTUW2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v16hi_mask, "__builtin_ia32_vcvtuw2ph256_mask", IX86_BUILTIN_VCVTUW2PH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HI_V16HF_UHI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v4si_mask, "__builtin_ia32_vcvtdq2ph128_mask", IX86_BUILTIN_VCVTDQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v8si_mask, "__builtin_ia32_vcvtdq2ph256_mask", IX86_BUILTIN_VCVTDQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v4si_mask, "__builtin_ia32_vcvtudq2ph128_mask", IX86_BUILTIN_VCVTUDQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v8si_mask, "__builtin_ia32_vcvtudq2ph256_mask", IX86_BUILTIN_VCVTUDQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v2di_mask, "__builtin_ia32_vcvtqq2ph128_mask", IX86_BUILTIN_VCVTQQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v4di_mask, "__builtin_ia32_vcvtqq2ph256_mask", IX86_BUILTIN_VCVTQQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v2di_mask, "__builtin_ia32_vcvtuqq2ph128_mask", IX86_BUILTIN_VCVTUQQ2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v4di_mask, "__builtin_ia32_vcvtuqq2ph256_mask", IX86_BUILTIN_VCVTUQQ2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DI_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv2df2_mask, "__builtin_ia32_vcvtph2pd128_mask", IX86_BUILTIN_VCVTPH2PD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V8HF_V2DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv4df2_mask, 
"__builtin_ia32_vcvtph2pd256_mask", IX86_BUILTIN_VCVTPH2PD256_MASK, UNKNOWN, (int) V4DF_FTYPE_V8HF_V4DF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv4sf2_mask, "__builtin_ia32_vcvtph2psx128_mask", IX86_BUILTIN_VCVTPH2PSX128_MASK, UNKNOWN, (int) V4SF_FTYPE_V8HF_V4SF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv8sf2_mask, "__builtin_ia32_vcvtph2psx256_mask", IX86_BUILTIN_VCVTPH2PSX256_MASK, UNKNOWN, (int) V8SF_FTYPE_V8HF_V8SF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v4sf_mask, "__builtin_ia32_vcvtps2phx128_mask", IX86_BUILTIN_VCVTPS2PHX128_MASK, UNKNOWN, (int) V8HF_FTYPE_V4SF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v8sf_mask, "__builtin_ia32_vcvtps2phx256_mask", IX86_BUILTIN_VCVTPS2PHX256_MASK, UNKNOWN, (int) V8HF_FTYPE_V8SF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v2df_mask, "__builtin_ia32_vcvtpd2ph128_mask", IX86_BUILTIN_VCVTPD2PH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V2DF_V8HF_UQI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v4df_mask, "__builtin_ia32_vcvtpd2ph256_mask", IX86_BUILTIN_VCVTPD2PH256_MASK, UNKNOWN, (int) V8HF_FTYPE_V4DF_V8HF_UQI) /* Builtins with rounding support. */ BDESC_END (ARGS, ROUND_ARGS) @@ -3003,20 +3076,70 @@ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv16sf_mask_round, "_ BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT) /* AVX512FP16. 
*/ -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_vaddph_v32hf_mask_round", IX86_BUILTIN_VADDPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_vsubph_v32hf_mask_round", IX86_BUILTIN_VSUBPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_vmulph_v32hf_mask_round", IX86_BUILTIN_VMULPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_vdivph_v32hf_mask_round", IX86_BUILTIN_VDIVPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask_round, "__builtin_ia32_vaddsh_v8hf_mask_round", IX86_BUILTIN_VADDSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask_round, "__builtin_ia32_vsubsh_v8hf_mask_round", IX86_BUILTIN_VSUBSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask_round, "__builtin_ia32_vmulsh_v8hf_mask_round", IX86_BUILTIN_VMULSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask_round, "__builtin_ia32_vdivsh_v8hf_mask_round", IX86_BUILTIN_VDIVSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask_round, "__builtin_ia32_vmaxph_v32hf_mask_round", IX86_BUILTIN_VMAXPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask_round, "__builtin_ia32_vminph_v32hf_mask_round", IX86_BUILTIN_VMINPH_V32HF_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask_round, "__builtin_ia32_vmaxsh_v8hf_mask_round", IX86_BUILTIN_VMAXSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask_round, "__builtin_ia32_vminsh_v8hf_mask_round", IX86_BUILTIN_VMINSH_V8HF_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask_round, "__builtin_ia32_vcmpph_v32hf_mask_round", IX86_BUILTIN_VCMPPH_V32HF_MASK_ROUND, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI_INT) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmcmpv8hf3_mask_round, "__builtin_ia32_vcmpsh_v8hf_mask_round", IX86_BUILTIN_VCMPSH_V8HF_MASK_ROUND, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask_round, "__builtin_ia32_addph512_mask_round", IX86_BUILTIN_ADDPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask_round, "__builtin_ia32_subph512_mask_round", IX86_BUILTIN_SUBPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv32hf3_mask_round, "__builtin_ia32_mulph512_mask_round", IX86_BUILTIN_MULPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, 
OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_divv32hf3_mask_round, "__builtin_ia32_divph512_mask_round", IX86_BUILTIN_DIVPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmaddv8hf3_mask_round, "__builtin_ia32_addsh_mask_round", IX86_BUILTIN_ADDSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsubv8hf3_mask_round, "__builtin_ia32_subsh_mask_round", IX86_BUILTIN_SUBSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmmulv8hf3_mask_round, "__builtin_ia32_mulsh_mask_round", IX86_BUILTIN_MULSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmdivv8hf3_mask_round, "__builtin_ia32_divsh_mask_round", IX86_BUILTIN_DIVSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_smaxv32hf3_mask_round, "__builtin_ia32_maxph512_mask_round", IX86_BUILTIN_MAXPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_sminv32hf3_mask_round, "__builtin_ia32_minph512_mask_round", IX86_BUILTIN_MINPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsmaxv8hf3_mask_round, "__builtin_ia32_maxsh_mask_round", IX86_BUILTIN_MAXSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsminv8hf3_mask_round, "__builtin_ia32_minsh_mask_round", IX86_BUILTIN_MINSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_cmpv32hf3_mask_round, "__builtin_ia32_cmpph512_mask_round", IX86_BUILTIN_CMPPH512_MASK_ROUND, UNKNOWN, (int) USI_FTYPE_V32HF_V32HF_INT_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmcmpv8hf3_mask_round, "__builtin_ia32_cmpsh_mask_round", IX86_BUILTIN_CMPSH_MASK_ROUND, UNKNOWN, (int) UQI_FTYPE_V8HF_V8HF_INT_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_sqrtv32hf2_mask_round, "__builtin_ia32_sqrtph512_mask_round", IX86_BUILTIN_SQRTPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vmsqrtv8hf2_mask_round, "__builtin_ia32_sqrtsh_mask_round", IX86_BUILTIN_SQRTSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_scalefv32hf_mask_round, "__builtin_ia32_scalefph512_mask_round", IX86_BUILTIN_SCALEFPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vmscalefv8hf_mask_round, "__builtin_ia32_scalefsh_mask_round", IX86_BUILTIN_SCALEFSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducepv32hf_mask_round, "__builtin_ia32_reduceph512_mask_round", IX86_BUILTIN_REDUCEPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_reducesv8hf_mask_round, "__builtin_ia32_reducesh_mask_round", IX86_BUILTIN_REDUCESH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf_mask_round, "__builtin_ia32_rndscaleph512_mask_round", 
IX86_BUILTIN_RNDSCALEPH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_rndscalev8hf_mask_round, "__builtin_ia32_rndscalesh_mask_round", IX86_BUILTIN_RNDSCALESH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_getexpv32hf_mask_round, "__builtin_ia32_getexpph512_mask", IX86_BUILTIN_GETEXPPH512, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_sgetexpv8hf_mask_round, "__builtin_ia32_getexpsh_mask_round", IX86_BUILTIN_GETEXPSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_getmantv32hf_mask_round, "__builtin_ia32_getmantph512_mask", IX86_BUILTIN_GETMANTPH512, UNKNOWN, (int) V32HF_FTYPE_V32HF_INT_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_vgetmantv8hf_mask_round, "__builtin_ia32_getmantsh_mask_round", IX86_BUILTIN_GETMANTSH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2dq_v16si_mask_round, "__builtin_ia32_vcvtph2dq512_mask_round", IX86_BUILTIN_VCVTPH2DQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2udq_v16si_mask_round, "__builtin_ia32_vcvtph2udq512_mask_round", IX86_BUILTIN_VCVTPH2UDQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv16si2_mask_round, "__builtin_ia32_vcvttph2dq512_mask_round", IX86_BUILTIN_VCVTTPH2DQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv16si2_mask_round, "__builtin_ia32_vcvttph2udq512_mask_round", IX86_BUILTIN_VCVTTPH2UDQ512_MASK_ROUND, UNKNOWN, (int) V16SI_FTYPE_V16HF_V16SI_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2qq_v8di_mask_round, "__builtin_ia32_vcvtph2qq512_mask_round", IX86_BUILTIN_VCVTPH2QQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uqq_v8di_mask_round, "__builtin_ia32_vcvtph2uqq512_mask_round", IX86_BUILTIN_VCVTPH2UQQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv8di2_mask_round, "__builtin_ia32_vcvttph2qq512_mask_round", IX86_BUILTIN_VCVTTPH2QQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncv8di2_mask_round, "__builtin_ia32_vcvttph2uqq512_mask_round", IX86_BUILTIN_VCVTTPH2UQQ512_MASK_ROUND, UNKNOWN, (int) V8DI_FTYPE_V8HF_V8DI_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2w_v32hi_mask_round, "__builtin_ia32_vcvtph2w512_mask_round", IX86_BUILTIN_VCVTPH2W512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtph2uw_v32hi_mask_round, "__builtin_ia32_vcvtph2uw512_mask_round", IX86_BUILTIN_VCVTPH2UW512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncv32hi2_mask_round, "__builtin_ia32_vcvttph2w512_mask_round", IX86_BUILTIN_VCVTTPH2W512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_avx512fp16_fixuns_truncv32hi2_mask_round, "__builtin_ia32_vcvttph2uw512_mask_round", IX86_BUILTIN_VCVTTPH2UW512_MASK_ROUND, UNKNOWN, (int) V32HI_FTYPE_V32HF_V32HI_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtw2ph_v32hi_mask_round, "__builtin_ia32_vcvtw2ph512_mask_round", IX86_BUILTIN_VCVTW2PH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HI_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuw2ph_v32hi_mask_round, "__builtin_ia32_vcvtuw2ph512_mask_round", IX86_BUILTIN_VCVTUW2PH512_MASK_ROUND, UNKNOWN, (int) V32HF_FTYPE_V32HI_V32HF_USI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtdq2ph_v16si_mask_round, "__builtin_ia32_vcvtdq2ph512_mask_round", IX86_BUILTIN_VCVTDQ2PH512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SI_V16HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtudq2ph_v16si_mask_round, "__builtin_ia32_vcvtudq2ph512_mask_round", IX86_BUILTIN_VCVTUDQ2PH512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SI_V16HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtqq2ph_v8di_mask_round, "__builtin_ia32_vcvtqq2ph512_mask_round", IX86_BUILTIN_VCVTQQ2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DI_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtuqq2ph_v8di_mask_round, "__builtin_ia32_vcvtuqq2ph512_mask_round", IX86_BUILTIN_VCVTUQQ2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DI_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2si_round, "__builtin_ia32_vcvtsh2si32_round", IX86_BUILTIN_VCVTSH2SI32_ROUND, UNKNOWN, (int) INT_FTYPE_V8HF_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2siq_round, "__builtin_ia32_vcvtsh2si64_round", IX86_BUILTIN_VCVTSH2SI64_ROUND, UNKNOWN, (int) INT64_FTYPE_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2usi_round, "__builtin_ia32_vcvtsh2usi32_round", IX86_BUILTIN_VCVTSH2USI32_ROUND, UNKNOWN, (int) UINT_FTYPE_V8HF_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2usiq_round, "__builtin_ia32_vcvtsh2usi64_round", IX86_BUILTIN_VCVTSH2USI64_ROUND, UNKNOWN, (int) UINT64_FTYPE_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncsi2_round, "__builtin_ia32_vcvttsh2si32_round", IX86_BUILTIN_VCVTTSH2SI32_ROUND, UNKNOWN, (int) INT_FTYPE_V8HF_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fix_truncdi2_round, "__builtin_ia32_vcvttsh2si64_round", IX86_BUILTIN_VCVTTSH2SI64_ROUND, UNKNOWN, (int) INT64_FTYPE_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncsi2_round, "__builtin_ia32_vcvttsh2usi32_round", IX86_BUILTIN_VCVTTSH2USI32_ROUND, UNKNOWN, (int) UINT_FTYPE_V8HF_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_fixuns_truncdi2_round, "__builtin_ia32_vcvttsh2usi64_round", IX86_BUILTIN_VCVTTSH2USI64_ROUND, UNKNOWN, (int) UINT64_FTYPE_V8HF_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsi2sh_round, "__builtin_ia32_vcvtsi2sh32_round", IX86_BUILTIN_VCVTSI2SH32_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsi2shq_round, "__builtin_ia32_vcvtsi2sh64_round", IX86_BUILTIN_VCVTSI2SH64_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_INT64_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtusi2sh_round, "__builtin_ia32_vcvtusi2sh32_round", 
IX86_BUILTIN_VCVTUSI2SH32_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_UINT_INT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtusi2shq_round, "__builtin_ia32_vcvtusi2sh64_round", IX86_BUILTIN_VCVTUSI2SH64_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8HF_UINT64_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv8df2_mask_round, "__builtin_ia32_vcvtph2pd512_mask_round", IX86_BUILTIN_VCVTPH2PD512_MASK_ROUND, UNKNOWN, (int) V8DF_FTYPE_V8HF_V8DF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_float_extend_phv16sf2_mask_round, "__builtin_ia32_vcvtph2psx512_mask_round", IX86_BUILTIN_VCVTPH2PSX512_MASK_ROUND, UNKNOWN, (int) V16SF_FTYPE_V16HF_V16SF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtpd2ph_v8df_mask_round, "__builtin_ia32_vcvtpd2ph512_mask_round", IX86_BUILTIN_VCVTPD2PH512_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V8DF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtps2ph_v16sf_mask_round, "__builtin_ia32_vcvtps2phx512_mask_round", IX86_BUILTIN_VCVTPS2PHX512_MASK_ROUND, UNKNOWN, (int) V16HF_FTYPE_V16SF_V16HF_UHI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2ss_mask_round, "__builtin_ia32_vcvtsh2ss_mask_round", IX86_BUILTIN_VCVTSH2SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsh2sd_mask_round, "__builtin_ia32_vcvtsh2sd_mask_round", IX86_BUILTIN_VCVTSH2SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtss2sh_mask_round, "__builtin_ia32_vcvtss2sh_mask_round", IX86_BUILTIN_VCVTSS2SH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_vcvtsd2sh_mask_round, "__builtin_ia32_vcvtsd2sh_mask_round", IX86_BUILTIN_VCVTSD2SH_MASK_ROUND, UNKNOWN, (int) V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT) BDESC_END (ROUND_ARGS, MULTI_ARG) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index e117afb..bfafd15 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -9710,6 +9710,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_V16SI_V16HI_UHI: case V16QI_FTYPE_V16SI_V16QI_UHI: case V16QI_FTYPE_V8DI_V16QI_UQI: + case V32HF_FTYPE_V32HF_V32HF_USI: case V16SF_FTYPE_V16SF_V16SF_UHI: case V16SF_FTYPE_V4SF_V16SF_UHI: case V16SI_FTYPE_SI_V16SI_UHI: @@ -9739,20 +9740,40 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_HI_V16HI_UHI: case V8HI_FTYPE_V8HI_V8HI_UQI: case V8HI_FTYPE_HI_V8HI_UQI: + case V16HF_FTYPE_V16HF_V16HF_UHI: case V8SF_FTYPE_V8HI_V8SF_UQI: case V4SF_FTYPE_V8HI_V4SF_UQI: + case V8SI_FTYPE_V8HF_V8SI_UQI: + case V8SF_FTYPE_V8HF_V8SF_UQI: case V8SI_FTYPE_V8SF_V8SI_UQI: case V4SI_FTYPE_V4SF_V4SI_UQI: + case V4SI_FTYPE_V8HF_V4SI_UQI: + case V4SF_FTYPE_V8HF_V4SF_UQI: + case V4DI_FTYPE_V8HF_V4DI_UQI: case V4DI_FTYPE_V4SF_V4DI_UQI: + case V2DI_FTYPE_V8HF_V2DI_UQI: case V2DI_FTYPE_V4SF_V2DI_UQI: + case V8HF_FTYPE_V8HF_V8HF_UQI: + case V8HF_FTYPE_V8HI_V8HF_UQI: + case V8HF_FTYPE_V8SI_V8HF_UQI: + case V8HF_FTYPE_V8SF_V8HF_UQI: + case V8HF_FTYPE_V4SI_V8HF_UQI: + case V8HF_FTYPE_V4SF_V8HF_UQI: + case V8HF_FTYPE_V4DI_V8HF_UQI: + case V8HF_FTYPE_V4DF_V8HF_UQI: + case V8HF_FTYPE_V2DI_V8HF_UQI: + case V8HF_FTYPE_V2DF_V8HF_UQI: case V4SF_FTYPE_V4DI_V4SF_UQI: case V4SF_FTYPE_V2DI_V4SF_UQI: case V4DF_FTYPE_V4DI_V4DF_UQI: + case 
V4DF_FTYPE_V8HF_V4DF_UQI: + case V2DF_FTYPE_V8HF_V2DF_UQI: case V2DF_FTYPE_V2DI_V2DF_UQI: case V16QI_FTYPE_V8HI_V16QI_UQI: case V16QI_FTYPE_V16HI_V16QI_UHI: case V16QI_FTYPE_V4SI_V16QI_UQI: case V16QI_FTYPE_V8SI_V16QI_UQI: + case V8HI_FTYPE_V8HF_V8HI_UQI: case V8HI_FTYPE_V4SI_V8HI_UQI: case V8HI_FTYPE_V8SI_V8HI_UQI: case V16QI_FTYPE_V2DI_V16QI_UQI: @@ -9810,6 +9831,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8DI_FTYPE_DI_V8DI_UQI: case V16SF_FTYPE_V8SF_V16SF_UHI: case V16SI_FTYPE_V8SI_V16SI_UHI: + case V16HF_FTYPE_V16HI_V16HF_UHI: + case V16HI_FTYPE_V16HF_V16HI_UHI: case V16HI_FTYPE_V16HI_V16HI_UHI: case V8HI_FTYPE_V16QI_V8HI_UQI: case V16HI_FTYPE_V16QI_V16HI_UHI: @@ -9910,6 +9933,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case HI_FTYPE_V16SF_INT_UHI: case QI_FTYPE_V8SF_INT_UQI: case QI_FTYPE_V4SF_INT_UQI: + case QI_FTYPE_V8HF_INT_UQI: + case HI_FTYPE_V16HF_INT_UHI: + case SI_FTYPE_V32HF_INT_USI: case V4SI_FTYPE_V4SI_V4SI_UHI: case V8SI_FTYPE_V8SI_V8SI_UHI: nargs = 3; @@ -10058,6 +10084,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16SF_FTYPE_V16SF_INT_V16SF_UHI: case V16HI_FTYPE_V16SF_INT_V16HI_UHI: case V16SI_FTYPE_V16SI_INT_V16SI_UHI: + case V16HF_FTYPE_V16HF_INT_V16HF_UHI: + case V8HF_FTYPE_V8HF_INT_V8HF_UQI: case V4SI_FTYPE_V16SI_INT_V4SI_UQI: case V4DI_FTYPE_V8DI_INT_V4DI_UQI: case V4DF_FTYPE_V8DF_INT_V4DF_UQI: @@ -10229,8 +10257,10 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_avx_vpermilv4df_mask: case CODE_FOR_avx512f_getmantv8df_mask: case CODE_FOR_avx512f_getmantv16sf_mask: + case CODE_FOR_avx512vl_getmantv16hf_mask: case CODE_FOR_avx512vl_getmantv8sf_mask: case CODE_FOR_avx512vl_getmantv4df_mask: + case CODE_FOR_avx512fp16_getmantv8hf_mask: case CODE_FOR_avx512vl_getmantv4sf_mask: case CODE_FOR_avx512vl_getmantv2df_mask: case CODE_FOR_avx512dq_rangepv8df_mask_round: @@ -10645,16 +10675,24 @@ ix86_expand_round_builtin (const struct builtin_description *d, { case UINT64_FTYPE_V2DF_INT: case UINT64_FTYPE_V4SF_INT: + case UINT64_FTYPE_V8HF_INT: case UINT_FTYPE_V2DF_INT: case UINT_FTYPE_V4SF_INT: + case UINT_FTYPE_V8HF_INT: case INT64_FTYPE_V2DF_INT: case INT64_FTYPE_V4SF_INT: + case INT64_FTYPE_V8HF_INT: case INT_FTYPE_V2DF_INT: case INT_FTYPE_V4SF_INT: + case INT_FTYPE_V8HF_INT: nargs = 2; break; case V32HF_FTYPE_V32HF_V32HF_INT: case V8HF_FTYPE_V8HF_V8HF_INT: + case V8HF_FTYPE_V8HF_INT_INT: + case V8HF_FTYPE_V8HF_UINT_INT: + case V8HF_FTYPE_V8HF_INT64_INT: + case V8HF_FTYPE_V8HF_UINT64_INT: case V4SF_FTYPE_V4SF_UINT_INT: case V4SF_FTYPE_V4SF_UINT64_INT: case V2DF_FTYPE_V2DF_UINT64_INT: @@ -10669,18 +10707,29 @@ ix86_expand_round_builtin (const struct builtin_description *d, break; case V8SF_FTYPE_V8DF_V8SF_QI_INT: case V8DF_FTYPE_V8DF_V8DF_QI_INT: + case V32HI_FTYPE_V32HF_V32HI_USI_INT: case V8SI_FTYPE_V8DF_V8SI_QI_INT: + case V8DI_FTYPE_V8HF_V8DI_UQI_INT: case V8DI_FTYPE_V8DF_V8DI_QI_INT: case V8SF_FTYPE_V8DI_V8SF_QI_INT: case V8DF_FTYPE_V8DI_V8DF_QI_INT: + case V8DF_FTYPE_V8HF_V8DF_UQI_INT: + case V16SF_FTYPE_V16HF_V16SF_UHI_INT: + case V32HF_FTYPE_V32HI_V32HF_USI_INT: + case V32HF_FTYPE_V32HF_V32HF_USI_INT: case V16SF_FTYPE_V16SF_V16SF_HI_INT: case V8DI_FTYPE_V8SF_V8DI_QI_INT: case V16SF_FTYPE_V16SI_V16SF_HI_INT: case V16SI_FTYPE_V16SF_V16SI_HI_INT: + case V16SI_FTYPE_V16HF_V16SI_UHI_INT: + case V16HF_FTYPE_V16SI_V16HF_UHI_INT: case V8DF_FTYPE_V8SF_V8DF_QI_INT: case V16SF_FTYPE_V16HI_V16SF_HI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: case 
V4SF_FTYPE_V4SF_V4SF_V4SF_INT: + case V8HF_FTYPE_V8DI_V8HF_UQI_INT: + case V8HF_FTYPE_V8DF_V8HF_UQI_INT: + case V16HF_FTYPE_V16SF_V16HF_UHI_INT: nargs = 4; break; case V4SF_FTYPE_V4SF_V4SF_INT_INT: @@ -10694,8 +10743,10 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: + case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT: case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT: + case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT: @@ -10703,8 +10754,11 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT: case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT: + case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT: + case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT: nargs = 5; break; + case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT: case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT: @@ -10727,6 +10781,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: + case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT: nargs = 6; nargs_constant = 4; break; @@ -10763,10 +10818,12 @@ ix86_expand_round_builtin (const struct builtin_description *d, { case CODE_FOR_avx512f_getmantv8df_mask_round: case CODE_FOR_avx512f_getmantv16sf_mask_round: + case CODE_FOR_avx512bw_getmantv32hf_mask_round: case CODE_FOR_avx512f_vgetmantv2df_round: case CODE_FOR_avx512f_vgetmantv2df_mask_round: case CODE_FOR_avx512f_vgetmantv4sf_round: case CODE_FOR_avx512f_vgetmantv4sf_mask_round: + case CODE_FOR_avx512f_vgetmantv8hf_mask_round: error ("the immediate argument must be a 4-bit immediate"); return const0_rtx; case CODE_FOR_avx512f_cmpv8df3_mask_round: @@ -11070,6 +11127,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case VOID_FTYPE_PFLOAT_V16SF_UHI: case VOID_FTYPE_PFLOAT_V8SF_UQI: case VOID_FTYPE_PFLOAT_V4SF_UQI: + case VOID_FTYPE_PCFLOAT16_V8HF_UQI: case VOID_FTYPE_PV32QI_V32HI_USI: case VOID_FTYPE_PV16QI_V16HI_UHI: case VOID_FTYPE_PUDI_V8HI_UQI: @@ -11142,6 +11200,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case V16SF_FTYPE_PCFLOAT_V16SF_UHI: case V8SF_FTYPE_PCFLOAT_V8SF_UQI: case V4SF_FTYPE_PCFLOAT_V4SF_UQI: + case V8HF_FTYPE_PCFLOAT16_V8HF_UQI: nargs = 3; klass = load; memory = 0; @@ -14054,7 +14113,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, tmp1 = gen_reg_rtx (SImode); emit_move_insn (tmp1, gen_lowpart (SImode, val)); - /* Insert the SImode value as low element of a V4SImode vector. */ + /* Insert the SImode value as low element of a V4SImode vector. */ tmp2 = gen_reg_rtx (V4SImode); emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); @@ -14179,6 +14238,8 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, break; case E_V8HImode: use_vector_set = TARGET_SSE2; + gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0 + ? 
gen_vec_setv8hi_0 : NULL; break; case E_V8QImode: use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; @@ -14190,8 +14251,12 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, use_vector_set = TARGET_SSE4_1; break; case E_V32QImode: + use_vector_set = TARGET_AVX; + break; case E_V16HImode: use_vector_set = TARGET_AVX; + gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0 + ? gen_vec_setv16hi_0 : NULL; break; case E_V8SImode: use_vector_set = TARGET_AVX; @@ -14239,6 +14304,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, use_vector_set = TARGET_AVX512FP16 && one_var == 0; gen_vec_set_0 = gen_vec_setv32hf_0; break; + case E_V32HImode: + use_vector_set = TARGET_AVX512FP16 && one_var == 0; + gen_vec_set_0 = gen_vec_setv32hi_0; default: break; } @@ -14638,7 +14706,7 @@ ix86_expand_vector_init_interleave (machine_mode mode, switch (mode) { case E_V8HFmode: - gen_load_even = gen_vec_setv8hf; + gen_load_even = gen_vec_interleave_lowv8hf; gen_interleave_first_low = gen_vec_interleave_lowv4si; gen_interleave_second_low = gen_vec_interleave_lowv2di; inner_mode = HFmode; @@ -14673,35 +14741,40 @@ ix86_expand_vector_init_interleave (machine_mode mode, op = ops [i + i]; if (inner_mode == HFmode) { - /* Convert HFmode to HImode. */ - op1 = gen_reg_rtx (HImode); - op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0); - op = gen_reg_rtx (HImode); - emit_move_insn (op, op1); + rtx even, odd; + /* Use vpuncklwd to pack 2 HFmode. */ + op0 = gen_reg_rtx (V8HFmode); + even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode); + odd = lowpart_subreg (V8HFmode, + force_reg (HFmode, ops[i + i + 1]), + HFmode); + emit_insn (gen_load_even (op0, even, odd)); } + else + { + /* Extend the odd elment to SImode using a paradoxical SUBREG. */ + op0 = gen_reg_rtx (SImode); + emit_move_insn (op0, gen_lowpart (SImode, op)); - /* Extend the odd elment to SImode using a paradoxical SUBREG. */ - op0 = gen_reg_rtx (SImode); - emit_move_insn (op0, gen_lowpart (SImode, op)); - - /* Insert the SImode value as low element of V4SImode vector. */ - op1 = gen_reg_rtx (V4SImode); - op0 = gen_rtx_VEC_MERGE (V4SImode, - gen_rtx_VEC_DUPLICATE (V4SImode, - op0), - CONST0_RTX (V4SImode), - const1_rtx); - emit_insn (gen_rtx_SET (op1, op0)); + /* Insert the SImode value as low element of V4SImode vector. */ + op1 = gen_reg_rtx (V4SImode); + op0 = gen_rtx_VEC_MERGE (V4SImode, + gen_rtx_VEC_DUPLICATE (V4SImode, + op0), + CONST0_RTX (V4SImode), + const1_rtx); + emit_insn (gen_rtx_SET (op1, op0)); - /* Cast the V4SImode vector back to a vector in orignal mode. */ - op0 = gen_reg_rtx (mode); - emit_move_insn (op0, gen_lowpart (mode, op1)); + /* Cast the V4SImode vector back to a vector in orignal mode. */ + op0 = gen_reg_rtx (mode); + emit_move_insn (op0, gen_lowpart (mode, op1)); - /* Load even elements into the second position. */ - emit_insn (gen_load_even (op0, - force_reg (inner_mode, - ops [i + i + 1]), - const1_rtx)); + /* Load even elements into the second position. */ + emit_insn (gen_load_even (op0, + force_reg (inner_mode, + ops[i + i + 1]), + const1_rtx)); + } /* Cast vector to FIRST_IMODE vector. 
*/ ops[i] = gen_reg_rtx (first_imode); @@ -15182,6 +15255,7 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) machine_mode inner_mode = GET_MODE_INNER (mode); machine_mode half_mode; bool use_vec_merge = false; + bool blendm_const = false; rtx tmp; static rtx (*gen_extract[7][2]) (rtx, rtx) = { @@ -15369,7 +15443,14 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) return; case E_V8HFmode: - use_vec_merge = true; + if (TARGET_AVX2) + { + mmode = SImode; + gen_blendm = gen_sse4_1_pblendph; + blendm_const = true; + } + else + use_vec_merge = true; break; case E_V8HImode: @@ -15396,10 +15477,20 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) goto half; case E_V16HFmode: - half_mode = V8HFmode; - j = 6; - n = 8; - goto half; + if (TARGET_AVX2) + { + mmode = SImode; + gen_blendm = gen_avx2_pblendph; + blendm_const = true; + break; + } + else + { + half_mode = V8HFmode; + j = 6; + n = 8; + goto half; + } case E_V16HImode: half_mode = V8HImode; @@ -15560,15 +15651,15 @@ quarter: { tmp = gen_reg_rtx (mode); emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); + rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode); /* The avx512*_blendm<mode> expanders have different operand order from VEC_MERGE. In VEC_MERGE, the first input operand is used for elements where the mask is set and second input operand otherwise, in {sse,avx}*_*blend* the first input operand is used for elements where the mask is clear and second input operand otherwise. */ - emit_insn (gen_blendm (target, target, tmp, - force_reg (mmode, - gen_int_mode (HOST_WIDE_INT_1U << elt, - mmode)))); + if (!blendm_const) + merge_mask = force_reg (mmode, merge_mask); + emit_insn (gen_blendm (target, target, tmp, merge_mask)); } else if (use_vec_merge) { diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index 5a99ea7..a525a83 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -2210,15 +2210,34 @@ remove_partial_avx_dependency (void) != AVX_PARTIAL_XMM_UPDATE_TRUE) continue; - if (!v4sf_const0) - v4sf_const0 = gen_reg_rtx (V4SFmode); - /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and vec_merge with subreg. 
*/ rtx src = SET_SRC (set); rtx dest = SET_DEST (set); machine_mode dest_mode = GET_MODE (dest); + machine_mode src_mode = GET_MODE (XEXP (src, 0)); + + switch (src_mode) + { + case E_SFmode: + case E_DFmode: + if (TARGET_USE_VECTOR_FP_CONVERTS + || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY) + continue; + break; + case E_SImode: + case E_DImode: + if (TARGET_USE_VECTOR_CONVERTS + || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY) + continue; + break; + default: + break; + } + + if (!v4sf_const0) + v4sf_const0 = gen_reg_rtx (V4SFmode); rtx zero; machine_mode dest_vecmode; diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index fcadfcd..2a2c8b8 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -90,6 +90,8 @@ VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */ VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */ VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */ VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */ +VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ +VECTOR_MODE (FLOAT, HF, 6); /* V6HF */ VECTOR_MODE (INT, TI, 1); /* V1TI */ VECTOR_MODE (INT, DI, 1); /* V1DI */ VECTOR_MODE (INT, SI, 1); /* V1SI */ diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index c0006b3..e7a3bd4 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -724,7 +724,7 @@ static const struct processor_costs *processor_cost_table[] = &slm_cost, &slm_cost, &slm_cost, - &slm_cost, + &tremont_cost, &slm_cost, &slm_cost, &skylake_cost, diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dcae34b..708834a 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -320,7 +320,7 @@ struct ix86_address addr_space_t seg; }; -extern int ix86_decompose_address (rtx, struct ix86_address *); +extern bool ix86_decompose_address (rtx, struct ix86_address *); extern int memory_address_length (rtx, bool); extern void x86_output_aligned_bss (FILE *, tree, const char *, unsigned HOST_WIDE_INT, unsigned); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7b173bc..afc2674 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -10101,10 +10101,10 @@ ix86_live_on_entry (bitmap regs) } /* Extract the parts of an RTL expression that is a valid memory address - for an instruction. Return 0 if the structure of the address is + for an instruction. Return false if the structure of the address is grossly off. 
*/ -int +bool ix86_decompose_address (rtx addr, struct ix86_address *out) { rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; @@ -10123,17 +10123,17 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) { addr = XEXP (addr, 0); if (CONST_INT_P (addr)) - return 0; + return false; } else if (GET_CODE (addr) == AND && const_32bit_mask (XEXP (addr, 1), DImode)) { addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode); if (addr == NULL_RTX) - return 0; + return false; if (CONST_INT_P (addr)) - return 0; + return false; } else if (GET_CODE (addr) == AND) { @@ -10167,7 +10167,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) { addr = SUBREG_REG (addr); if (CONST_INT_P (addr)) - return 0; + return false; } } @@ -10178,7 +10178,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) if (REG_P (SUBREG_REG (addr))) base = addr; else - return 0; + return false; } else if (GET_CODE (addr) == PLUS) { @@ -10189,13 +10189,13 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) do { if (n >= 4) - return 0; + return false; addends[n++] = XEXP (op, 1); op = XEXP (op, 0); } while (GET_CODE (op) == PLUS); if (n >= 4) - return 0; + return false; addends[n] = op; for (i = n; i >= 0; --i) @@ -10205,28 +10205,28 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) { case MULT: if (index) - return 0; + return false; index = XEXP (op, 0); scale_rtx = XEXP (op, 1); break; case ASHIFT: if (index) - return 0; + return false; index = XEXP (op, 0); tmp = XEXP (op, 1); if (!CONST_INT_P (tmp)) - return 0; + return false; scale = INTVAL (tmp); if ((unsigned HOST_WIDE_INT) scale > 3) - return 0; + return false; scale = 1 << scale; break; case ZERO_EXTEND: op = XEXP (op, 0); if (GET_CODE (op) != UNSPEC) - return 0; + return false; /* FALLTHRU */ case UNSPEC: @@ -10235,12 +10235,12 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) && seg == ADDR_SPACE_GENERIC) seg = DEFAULT_TLS_SEG_REG; else - return 0; + return false; break; case SUBREG: if (!REG_P (SUBREG_REG (op))) - return 0; + return false; /* FALLTHRU */ case REG: @@ -10249,7 +10249,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) else if (!index) index = op; else - return 0; + return false; break; case CONST: @@ -10257,12 +10257,12 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) case SYMBOL_REF: case LABEL_REF: if (disp) - return 0; + return false; disp = op; break; default: - return 0; + return false; } } } @@ -10277,10 +10277,10 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) index = XEXP (addr, 0); tmp = XEXP (addr, 1); if (!CONST_INT_P (tmp)) - return 0; + return false; scale = INTVAL (tmp); if ((unsigned HOST_WIDE_INT) scale > 3) - return 0; + return false; scale = 1 << scale; } else @@ -10294,14 +10294,14 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) && REG_P (SUBREG_REG (index))) ; else - return 0; + return false; } /* Extract the integral value of scale. */ if (scale_rtx) { if (!CONST_INT_P (scale_rtx)) - return 0; + return false; scale = INTVAL (scale_rtx); } @@ -10354,7 +10354,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) out->scale = scale; out->seg = seg; - return 1; + return true; } /* Return cost of the memory address x. @@ -16976,6 +16976,7 @@ ix86_sched_init_global (FILE *, int, int) case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: + case PROCESSOR_TREMONT: case PROCESSOR_GENERIC: /* Do not perform multipass scheduling for pre-reload schedule to save compile time. 
*/ @@ -19443,8 +19444,11 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to, /* Vector registers do not support QI or HImode loads. If we don't disallow a change to these modes, reload will assume it's ok to drop the subreg from (subreg:SI (reg:HI 100) 0). This affects - the vec_dupv4hi pattern. */ - if (GET_MODE_SIZE (from) < 4) + the vec_dupv4hi pattern. + NB: AVX512FP16 supports vmovw which can load 16bit data to sse + register. */ + int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 4; + if (GET_MODE_SIZE (from) < mov_size) return false; } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index e76bb55..ec60b89 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -334,6 +334,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY] #define TARGET_SSE_PARTIAL_REG_DEPENDENCY \ ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY] +#define TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY \ + ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY] +#define TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY \ + ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY] #define TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL] #define TARGET_SSE_UNALIGNED_STORE_OPTIMAL \ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 13f6f57..c82a9dc 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4535,7 +4535,8 @@ (float_extend:DF (match_operand:SF 1 "nonimmediate_operand")))] "!TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + && TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY + && epilogue_completed && optimize_function_for_speed_p (cfun) && (!REG_P (operands[1]) || (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1]))) @@ -4708,7 +4709,8 @@ (float_truncate:SF (match_operand:DF 1 "nonimmediate_operand")))] "!TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + && TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY + && epilogue_completed && optimize_function_for_speed_p (cfun) && (!REG_P (operands[1]) || (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1]))) @@ -5243,7 +5245,8 @@ [(set (match_operand:MODEF 0 "sse_reg_operand") (float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))] "!TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + && TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY + && epilogue_completed && optimize_function_for_speed_p (cfun) && (!EXT_REX_SSE_REG_P (operands[0]) || TARGET_AVX512VL)" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 516eb45..d7a1328 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -396,6 +396,13 @@ (define_mode_iterator VF1_AVX512ER_128_256 [(V16SF "TARGET_AVX512ER") (V8SF "TARGET_AVX") V4SF]) +(define_mode_iterator VFH_AVX512VL + [(V32HF "TARGET_AVX512FP16") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") + V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) + (define_mode_iterator VF2_AVX512VL [V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) @@ -405,6 +412,9 @@ (define_mode_iterator VF_AVX512FP16 [V32HF V16HF V8HF]) +(define_mode_iterator VF_AVX512FP16VL + [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")]) + ;; All vector integer modes (define_mode_iterator VI [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") @@ -493,6 
+503,11 @@ (define_mode_iterator VI2_AVX512VL [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI]) +(define_mode_iterator VI2H_AVX512VL + [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI + (V8SI "TARGET_AVX512VL") V16SI + V8DI ]) + (define_mode_iterator VI1_AVX512VL_F [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F")]) @@ -622,6 +637,9 @@ (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")]) +(define_mode_iterator VF4_128_8_256 + [V4DF V4SF]) + (define_mode_iterator VI1_AVX512VLBW [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL") (V16QI "TARGET_AVX512VL")]) @@ -707,7 +725,8 @@ [(V16SF "V4SF") (V8DF "V2DF") (V16SI "TI") (V8DI "TI")]) (define_mode_attr vecmemsuffix - [(V16SF "{z}") (V8SF "{y}") (V4SF "{x}") + [(V32HF "{z}") (V16HF "{y}") (V8HF "{x}") + (V16SF "{z}") (V8SF "{y}") (V4SF "{x}") (V8DF "{z}") (V4DF "{y}") (V2DF "{x}")]) (define_mode_attr ssedoublemodelower @@ -727,6 +746,11 @@ [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI") (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")]) +(define_mode_attr sseintconvert + [(V32HI "w") (V16HI "w") (V8HI "w") + (V16SI "dq") (V8SI "dq") (V4SI "dq") + (V8DI "qq") (V4DI "qq") (V2DI "qq")]) + ;; All 128bit vector integer modes (define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI]) @@ -768,6 +792,7 @@ (V32HF "TARGET_AVX512BW")]) ;; Int-float size matches +(define_mode_iterator VI2F [V8HI V16HI V32HI V8HF V16HF V32HF]) (define_mode_iterator VI4F_128 [V4SI V4SF]) (define_mode_iterator VI8F_128 [V2DI V2DF]) (define_mode_iterator VI4F_256 [V8SI V8SF]) @@ -782,6 +807,12 @@ (V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")]) (define_mode_iterator VF48_I1248 [V16SI V16SF V8DI V8DF V32HI V64QI]) +(define_mode_iterator VF48H_AVX512VL + [V8DF V16SF (V8SF "TARGET_AVX512VL")]) + +(define_mode_iterator VF48_128 + [V2DF V4SF]) + (define_mode_iterator VI48F [V16SI V16SF V8DI V8DF (V8SI "TARGET_AVX512VL") (V8SF "TARGET_AVX512VL") @@ -806,6 +837,7 @@ (V8SF "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL") V16SF V8DF]) +(define_mode_iterator V8_128 [V8HI V8HF]) (define_mode_iterator V16_256 [V16HI V16HF]) (define_mode_iterator V32_512 [V32HI V32HF]) @@ -918,9 +950,9 @@ ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode - [(V16SF "V16SI") (V8DF "V8DI") - (V8SF "V8SI") (V4DF "V4DI") - (V4SF "V4SI") (V2DF "V2DI") + [(V32HF "V32HI") (V16SF "V16SI") (V8DF "V8DI") + (V16HF "V16HI") (V8SF "V8SI") (V4DF "V4DI") + (V8HF "V8HI") (V4SF "V4SI") (V2DF "V2DI") (V16SI "V16SI") (V8DI "V8DI") (V8SI "V8SI") (V4DI "V4DI") (V4SI "V4SI") (V2DI "V2DI") @@ -971,6 +1003,13 @@ (V4SF "v2sf") (V32HF "v16hf") (V16HF "v8hf") (V8HF "v4hf")]) +;; Mapping of vector modes to vector hf modes of conversion. +(define_mode_attr ssePHmode + [(V32HI "V32HF") (V16HI "V16HF") (V8HI "V8HF") + (V16SI "V16HF") (V8SI "V8HF") (V4SI "V8HF") + (V8DI "V8HF") (V4DI "V8HF") (V2DI "V8HF") + (V8DF "V8HF") (V16SF "V16HF") (V8SF "V8HF")]) + ;; Mapping of vector modes to packed single mode of the same size (define_mode_attr ssePSmode [(V16SI "V16SF") (V8DF "V16SF") @@ -1116,7 +1155,8 @@ ;; Mapping of mode to cast intrinsic name (define_mode_attr castmode - [(V8SI "si") (V8SF "ps") (V4DF "pd") + [(V4SF "ps") (V2DF "pd") + (V8SI "si") (V8SF "ps") (V4DF "pd") (V16SI "si") (V16SF "ps") (V8DF "pd")]) ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise. 
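The mode attributes introduced above (for instance <sseintconvert>, which supplies the w/dq/qq mnemonic suffix, and <ssePHmode>, which maps each integer vector mode to its half-float counterpart) let a single define_insn template further down in sse.md expand into the whole vcvtph2w / vcvtph2dq / vcvtph2qq conversion family. As a minimal user-level sketch of the kind of conversion those patterns implement — assuming a compiler built with this patch, -mavx512fp16, and the GNU vector extension __builtin_convertvector; the function and type names below are illustrative only and not part of the patch:

    /* Compile with -mavx512fp16; illustrative only.  */
    typedef short     v32hi __attribute__ ((vector_size (64)));
    typedef _Float16  v32hf __attribute__ ((vector_size (64)));

    /* Convert 32 signed 16-bit integers to 32 half-precision floats.
       With the new conversion patterns available, this is expected to
       lower to a single vcvtw2ph instruction instead of a scalar loop.  */
    v32hf
    cvt_w_to_ph (v32hi x)
    {
      return __builtin_convertvector (x, v32hf);
    }
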
@@ -1349,13 +1389,13 @@ [(set (match_dup 0) (match_dup 1))]) (define_insn "avx512f_mov<ssescalarmodelower>_mask" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (vec_merge:VF_128 - (match_operand:VF_128 2 "register_operand" "v") - (match_operand:VF_128 3 "nonimm_or_0_operand" "0C") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (vec_merge:VFH_128 + (match_operand:VFH_128 2 "register_operand" "v") + (match_operand:VFH_128 3 "nonimm_or_0_operand" "0C") (match_operand:QI 4 "register_operand" "Yk")) - (match_operand:VF_128 1 "register_operand" "v") + (match_operand:VFH_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F" "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}" @@ -1368,7 +1408,7 @@ (vec_merge:<ssevecmode> (vec_merge:<ssevecmode> (vec_duplicate:<ssevecmode> - (match_operand:MODEF 1 "memory_operand")) + (match_operand:MODEFH 1 "memory_operand")) (match_operand:<ssevecmode> 2 "nonimm_or_0_operand") (match_operand:QI 3 "register_operand")) (match_dup 4) @@ -1381,7 +1421,7 @@ (vec_merge:<ssevecmode> (vec_merge:<ssevecmode> (vec_duplicate:<ssevecmode> - (match_operand:MODEF 1 "memory_operand" "m")) + (match_operand:MODEFH 1 "memory_operand" "m")) (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C") (match_operand:QI 3 "register_operand" "Yk")) (match_operand:<ssevecmode> 4 "const0_operand" "C") @@ -1394,11 +1434,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512f_store<mode>_mask" - [(set (match_operand:MODEF 0 "memory_operand" "=m") - (if_then_else:MODEF + [(set (match_operand:MODEFH 0 "memory_operand" "=m") + (if_then_else:MODEFH (and:QI (match_operand:QI 2 "register_operand" "Yk") (const_int 1)) - (vec_select:MODEF + (vec_select:MODEFH (match_operand:<ssevecmode> 1 "register_operand" "v") (parallel [(const_int 0)])) (match_dup 0)))] @@ -2338,6 +2378,30 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "avx512fp16_rcp<mode>2<mask_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=v") + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "vm")] + UNSPEC_RCP))] + "TARGET_AVX512FP16" + "vrcpph\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512fp16_vmrcpv8hf2<mask_scalar_name>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (unspec:V8HF [(match_operand:V8HF 1 "nonimmediate_operand" "vm")] + UNSPEC_RCP) + (match_operand:V8HF 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vrcpsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + (define_insn "<mask_codefor>rcp14<mode><mask_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (unspec:VF_AVX512VL @@ -2381,8 +2445,8 @@ (set_attr "mode" "<MODE>")]) (define_expand "sqrt<mode>2" - [(set (match_operand:VF2 0 "register_operand") - (sqrt:VF2 (match_operand:VF2 1 "vector_operand")))] + [(set (match_operand:VF2H 0 "register_operand") + (sqrt:VF2H (match_operand:VF2H 1 "vector_operand")))] "TARGET_SSE2") (define_expand "sqrt<mode>2" @@ -2402,8 +2466,8 @@ }) (define_insn "<sse>_sqrt<mode>2<mask_name><round_name>" - [(set (match_operand:VF 0 "register_operand" "=x,v") - (sqrt:VF (match_operand:VF 1 "<round_nimm_predicate>" "xBm,<round_constraint>")))] + [(set (match_operand:VFH 0 "register_operand" "=x,v") + 
(sqrt:VFH (match_operand:VFH 1 "<round_nimm_predicate>" "xBm,<round_constraint>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>" "@ sqrt<ssemodesuffix>\t{%1, %0|%0, %1} @@ -2416,11 +2480,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "<sse>_vmsqrt<mode>2<mask_scalar_name><round_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=x,v") - (vec_merge:VF_128 - (sqrt:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" "xm,<round_scalar_constraint>")) - (match_operand:VF_128 2 "register_operand" "0,v") + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (sqrt:VFH_128 + (match_operand:VFH_128 1 "nonimmediate_operand" "xm,<round_scalar_constraint>")) + (match_operand:VFH_128 2 "register_operand" "0,v") (const_int 1)))] "TARGET_SSE" "@ @@ -2473,6 +2537,16 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<MODE>")]) +(define_insn "<sse>_rsqrt<mode>2<mask_name>" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=v") + (unspec:VF_AVX512FP16VL + [(match_operand:VF_AVX512FP16VL 1 "vector_operand" "vBm")] UNSPEC_RSQRT))] + "TARGET_AVX512FP16" + "vrsqrtph\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + (define_insn "<mask_codefor>rsqrt14<mode><mask_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (unspec:VF_AVX512VL @@ -2548,6 +2622,19 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "avx512fp16_vmrsqrtv8hf2<mask_scalar_name>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (unspec:V8HF [(match_operand:V8HF 1 "nonimmediate_operand" "vm")] + UNSPEC_RSQRT) + (match_operand:V8HF 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vrsqrtsh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %w1}" + [(set_attr "type" "sse") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + (define_expand "cond_<code><mode>" [(set (match_operand:VF 0 "register_operand") (vec_merge:VF @@ -3200,28 +3287,28 @@ }) (define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL + [(match_operand:VFH_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_REDUCE))] - "TARGET_AVX512DQ" + "TARGET_AVX512DQ || (VALID_AVX512FP16_REG_MODE (<MODE>mode))" "vreduce<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}" [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) (define_insn "reduces<mode><mask_scalar_name><round_saeonly_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_REDUCE) (match_dup 1) (const_int 
1)))] - "TARGET_AVX512DQ" + "TARGET_AVX512DQ || (VALID_AVX512FP16_REG_MODE (<MODE>mode))" "vreduce<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}" [(set_attr "type" "sse") (set_attr "prefix" "evex") @@ -5655,6 +5742,552 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; +;; Parallel half-precision floating point conversion operations +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_int_iterator UNSPEC_US_FIX_NOTRUNC + [UNSPEC_UNSIGNED_FIX_NOTRUNC UNSPEC_FIX_NOTRUNC]) + +(define_int_attr sseintconvertsignprefix + [(UNSPEC_UNSIGNED_FIX_NOTRUNC "u") + (UNSPEC_FIX_NOTRUNC "")]) + +(define_mode_attr qq2phsuff + [(V32HI "") (V16HI "") (V8HI "") + (V16SI "") (V8SI "{y}") (V4SI "{x}") + (V8DI "{z}") (V4DI "{y}") (V2DI "{x}") + (V16SF "") (V8SF "{y}") (V4SF "{x}") + (V8DF "{z}") (V4DF "{y}") (V2DF "{x}")]) + +(define_insn "avx512fp16_vcvtph2<sseintconvertsignprefix><sseintconvert>_<mode><mask_name><round_name>" + [(set (match_operand:VI248_AVX512VL 0 "register_operand" "=v") + (unspec:VI248_AVX512VL + [(match_operand:<ssePHmode> 1 "<round_nimm_predicate>" "<round_constraint>")] + UNSPEC_US_FIX_NOTRUNC))] + "TARGET_AVX512FP16" + "vcvtph2<sseintconvertsignprefix><sseintconvert>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode><mask_name><round_name>" + [(set (match_operand:<ssePHmode> 0 "register_operand" "=v") + (any_float:<ssePHmode> + (match_operand:VI2H_AVX512VL 1 "<round_nimm_predicate>" "<round_constraint>")))] + "TARGET_AVX512FP16" + "vcvt<floatsuffix><sseintconvert>2ph<round_qq2phsuff>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) + (match_dup 2)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V4HFmode);") + +(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) + (match_operand:V4HF 2 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) + (vec_select:V4HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_dup 4)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V4HFmode);") + +(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + 
(vec_merge:V4HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 "vector_operand" "vm")) + (vec_select:V4HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V4HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512fp16_vcvt<floatsuffix><sseintconvert>2ph_<mode>_mask_1" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (any_float:V4HF (match_operand:VI4_128_8_256 1 + "vector_operand" "vm")) + (match_operand:V4HF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V4HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix><sseintconvert>2ph<qq2phsuff>\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) + (match_dup 2)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V6HFmode);") + +(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) + (match_operand:V6HF 2 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix>qq2ph{x}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_expand "avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) + (vec_select:V2HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_dup 4)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V6HFmode);") + +(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (any_float:V2HF (match_operand:V2DI 1 "vector_operand" "vm")) + (vec_select:V2HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V6HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix>qq2ph{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*avx512fp16_vcvt<floatsuffix>qq2ph_v2di_mask_1" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (any_float:V2HF (match_operand:V2DI 1 + "vector_operand" "vm")) + (match_operand:V2HF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V6HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<floatsuffix>qq2ph{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" 
"TI")]) + +(define_insn "avx512fp16_vcvtsh2<sseintconvertsignprefix>si<rex64namesuffix><round_name>" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (unspec:SWI48 + [(vec_select:HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0)]))] + UNSPEC_US_FIX_NOTRUNC))] + "TARGET_AVX512FP16" + "vcvtsh2<sseintconvertsignprefix>si\t{<round_op2>%1, %0|%0, %1<round_op2>}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512fp16_vcvtsh2<sseintconvertsignprefix>si<rex64namesuffix>_2" + [(set (match_operand:SWI48 0 "register_operand" "=r,r") + (unspec:SWI48 + [(match_operand:HF 1 "nonimmediate_operand" "v,m")] + UNSPEC_US_FIX_NOTRUNC))] + "TARGET_AVX512FP16" + "vcvtsh2<sseintconvertsignprefix>si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_mode_attr sseicvtsuffix + [(SI "l") (DI "q")]) +(define_insn "avx512fp16_vcvt<floatsuffix>si2sh<rex64namesuffix><round_name>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (vec_duplicate:V8HF + (any_float:HF + (match_operand:SWI48 2 "<round_nimm_scalar_predicate>" "<round_constraint3>"))) + (match_operand:V8HF 1 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvt<floatsuffix>si2sh{<sseicvtsuffix>}\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name><round_saeonly_name>" + [(set (match_operand:VI2H_AVX512VL 0 "register_operand" "=v") + (any_fix:VI2H_AVX512VL + (match_operand:<ssePHmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))] + "TARGET_AVX512FP16" + "vcvttph2<fixsuffix><sseintconvert>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<mask_name>" + [(set (match_operand:VI4_128_8_256 0 "register_operand" "=v") + (any_fix:VI4_128_8_256 + (vec_select:V4HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvttph2<fixsuffix><sseintconvert>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512fp16_fix<fixunssuffix>_trunc<mode>2_load<mask_name>" + [(set (match_operand:VI4_128_8_256 0 "register_operand" "=v") + (any_fix:VI4_128_8_256 + (match_operand:V4HF 1 "memory_operand" "m")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvttph2<fixsuffix><sseintconvert>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_truncv2di2<mask_name>" + [(set (match_operand:V2DI 0 "register_operand" "=v") + (any_fix:V2DI + (vec_select:V2HF + (match_operand:V8HF 1 "nonimmediate_operand" "v") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvttph2<fixsuffix>qq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*avx512fp16_fix<fixunssuffix>_truncv2di2_load<mask_name>" + [(set (match_operand:V2DI 0 
"register_operand" "=v") + (any_fix:V2DI + (match_operand:V2HF 1 "memory_operand" "m")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvttph2<fixsuffix>qq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2<round_saeonly_name>" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (any_fix:SWI48 + (vec_select:HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0)]))))] + "TARGET_AVX512FP16" + "%vcvttsh2<fixsuffix>si\t{<round_saeonly_op2>%1, %0|%0, %k1<round_saeonly_op2>}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512fp16_fix<fixunssuffix>_trunc<mode>2_mem" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (any_fix:SWI48 + (match_operand:HF 1 "memory_operand" "vm")))] + "TARGET_AVX512FP16" + "%vcvttsh2<fixsuffix>si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_mode_attr ph2pssuffix + [(V16SF "x") (V8SF "x") (V4SF "x") + (V8DF "") (V4DF "") (V2DF "")]) + +(define_insn "avx512fp16_float_extend_ph<mode>2<mask_name><round_saeonly_name>" + [(set (match_operand:VF48H_AVX512VL 0 "register_operand" "=v") + (float_extend:VF48H_AVX512VL + (match_operand:<ssePHmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))] + "TARGET_AVX512FP16" + "vcvtph2<castmode><ph2pssuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_float_extend_ph<mode>2<mask_name>" + [(set (match_operand:VF4_128_8_256 0 "register_operand" "=v") + (float_extend:VF4_128_8_256 + (vec_select:V4HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtph2<castmode><ph2pssuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512fp16_float_extend_ph<mode>2_load<mask_name>" + [(set (match_operand:VF4_128_8_256 0 "register_operand" "=v") + (float_extend:VF4_128_8_256 + (match_operand:V4HF 1 "memory_operand" "m")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtph2<castmode><ph2pssuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "avx512fp16_float_extend_phv2df2<mask_name>" + [(set (match_operand:V2DF 0 "register_operand" "=v") + (float_extend:V2DF + (vec_select:V2HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtph2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*avx512fp16_float_extend_phv2df2_load<mask_name>" + [(set (match_operand:V2DF 0 "register_operand" "=v") + (float_extend:V2DF + (match_operand:V2HF 1 "memory_operand" "m")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtph2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn 
"avx512fp16_vcvt<castmode>2ph_<mode><mask_name><round_name>" + [(set (match_operand:<ssePHmode> 0 "register_operand" "=v") + (float_truncate:<ssePHmode> + (match_operand:VF48H_AVX512VL 1 "<round_nimm_predicate>" "<round_constraint>")))] + "TARGET_AVX512FP16" + "vcvt<castmode>2ph<ph2pssuffix><round_qq2phsuff>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (match_dup 2)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V4HFmode);") + +(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (match_operand:V4HF 2 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvt<castmode>2ph_<mode>_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (vec_select:V4HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_dup 4)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V4HFmode);") + +(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (vec_select:V4HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V4HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512fp16_vcvt<castmode>2ph_<mode>_mask_1" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V4HF + (float_truncate:V4HF + (match_operand:VF4_128_8_256 1 "vector_operand" "vm")) + (match_operand:V4HF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V4HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvt<castmode>2ph<ph2pssuffix><qq2phsuff>\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "avx512fp16_vcvtpd2ph_v2df" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (float_truncate:V2HF + (match_operand:V2DF 1 "vector_operand" "vm")) + (match_dup 2)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V6HFmode);") + +(define_insn "*avx512fp16_vcvtpd2ph_v2df" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (float_truncate:V2HF + (match_operand:V2DF 1 "vector_operand" 
"vm")) + (match_operand:V6HF 2 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtpd2ph{x}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_expand "avx512fp16_vcvtpd2ph_v2df_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (float_truncate:V2HF + (match_operand:V2DF 1 "vector_operand" "vm")) + (vec_select:V2HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_dup 4)))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V6HFmode);") + +(define_insn "*avx512fp16_vcvtpd2ph_v2df_mask" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (float_truncate:V2HF (match_operand:V2DF 1 "vector_operand" "vm")) + (vec_select:V2HF + (match_operand:V8HF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V6HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtpd2ph{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*avx512fp16_vcvtpd2ph_v2df_mask_1" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_concat:V8HF + (vec_merge:V2HF + (float_truncate:V2HF + (match_operand:V2DF 1 "vector_operand" "vm")) + (match_operand:V2HF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V6HF 4 "const0_operand" "C")))] + "TARGET_AVX512FP16 && TARGET_AVX512VL" + "vcvtpd2ph{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_vcvtsh2<ssescalarmodesuffix><mask_scalar_name><round_saeonly_scalar_name>" + [(set (match_operand:VF48_128 0 "register_operand" "=v") + (vec_merge:VF48_128 + (vec_duplicate:VF48_128 + (float_extend:<ssescalarmode> + (vec_select:HF + (match_operand:V8HF 1 "register_operand" "v") + (parallel [(const_int 0)])))) + (match_operand:VF48_128 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvtsh2<ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1<round_saeonly_scalar_mask_op3>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_vcvtsh2<ssescalarmodesuffix><mask_scalar_name>_mem" + [(set (match_operand:VF48_128 0 "register_operand" "=v") + (vec_merge:VF48_128 + (vec_duplicate:VF48_128 + (float_extend:<ssescalarmode> + (match_operand:HF 1 "memory_operand" "m"))) + (match_operand:VF48_128 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvtsh2<ssescalarmodesuffix>\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_vcvt<ssescalarmodesuffix>2sh<mask_scalar_name><round_scalar_name>" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (vec_duplicate:V8HF + (float_truncate:HF + (vec_select:<ssescalarmode> + (match_operand:VF48_128 1 "register_operand" "v") + (parallel [(const_int 0)])))) + (match_operand:V8HF 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvt<ssescalarmodesuffix>2sh\t{<round_scalar_mask_op3>%1, %2, 
%0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1<round_scalar_mask_op3>}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "avx512fp16_vcvt<ssescalarmodesuffix>2sh<mask_scalar_name>_mem" + [(set (match_operand:V8HF 0 "register_operand" "=v") + (vec_merge:V8HF + (vec_duplicate:V8HF + (float_truncate:HF + (match_operand:MODEF 1 "memory_operand" "m"))) + (match_operand:V8HF 2 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512FP16" + "vcvt<ssescalarmodesuffix>2sh\t{%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ;; Parallel single-precision floating point conversion operations ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -8759,11 +9392,11 @@ ;; vmovw clears also the higer bits (define_insn "vec_set<mode>_0" - [(set (match_operand:VF_AVX512FP16 0 "register_operand" "=v,v") - (vec_merge:VF_AVX512FP16 - (vec_duplicate:VF_AVX512FP16 - (match_operand:HF 2 "nonimmediate_operand" "r,m")) - (match_operand:VF_AVX512FP16 1 "const0_operand" "C,C") + [(set (match_operand:VI2F 0 "register_operand" "=v,v") + (vec_merge:VI2F + (vec_duplicate:VI2F + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,m")) + (match_operand:VI2F 1 "const0_operand" "C,C") (const_int 1)))] "TARGET_AVX512FP16" "@ @@ -9031,7 +9664,8 @@ [(V16SF "avx512f") (V16SI "avx512f") (V8DF "avx512dq") (V8DI "avx512dq")]) (define_mode_attr extract_suf - [(V16SF "32x4") (V16SI "32x4") (V8DF "64x2") (V8DI "64x2")]) + [(V16SF "32x4") (V16SI "32x4") (V8DF "64x2") (V8DI "64x2") + (V8SF "32x4") (V8SI "32x4") (V4DF "64x2") (V4DI "64x2")]) (define_mode_iterator AVX512_VEC [(V8DF "TARGET_AVX512DQ") (V8DI "TARGET_AVX512DQ") V16SF V16SI]) @@ -9891,16 +10525,33 @@ "operands[1] = gen_lowpart (HFmode, operands[1]);") (define_insn "*vec_extracthf" - [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=r,m") + [(set (match_operand:HF 0 "register_sse4nonimm_operand" "=*r,m,x,v") (vec_select:HF - (match_operand:V8HF 1 "register_operand" "v,v") + (match_operand:V8HF 1 "register_operand" "v,v,0,v") (parallel [(match_operand:SI 2 "const_0_to_7_operand")])))] "TARGET_SSE2" - "@ - vpextrw\t{%2, %1, %k0|%k0, %1, %2} - vpextrw\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sselog1") +{ + switch (which_alternative) + { + case 0: + return "vpextrw\t{%2, %1, %k0|%k0, %1, %2}"; + case 1: + return "vpextrw\t{%2, %1, %0|%0, %1, %2}"; + + case 2: + operands[2] = GEN_INT (INTVAL (operands[2]) * 2); + return "psrldq\t{%2, %0|%0, %2}"; + case 3: + operands[2] = GEN_INT (INTVAL (operands[2]) * 2); + return "vpsrldq\t{%2, %1, %0|%0, %1, %2}"; + + default: + gcc_unreachable (); + } +} + [(set_attr "isa" "*,*,noavx,avx") + (set_attr "type" "sselog1,sselog1,sseishft1,sseishft1") (set_attr "prefix" "maybe_evex") (set_attr "mode" "TI")]) @@ -10255,11 +10906,11 @@ }) (define_insn "avx512f_vmscalef<mode><mask_scalar_name><round_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_scalar_nimm_predicate>" "<round_scalar_constraint>")] + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "<round_scalar_nimm_predicate>" 
"<round_scalar_constraint>")] UNSPEC_SCALEF) (match_dup 1) (const_int 1)))] @@ -10269,10 +10920,10 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "<avx512>_scalef<mode><mask_name><round_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "register_operand" "v") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")] + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL + [(match_operand:VFH_AVX512VL 1 "register_operand" "v") + (match_operand:VFH_AVX512VL 2 "nonimmediate_operand" "<round_constraint>")] UNSPEC_SCALEF))] "TARGET_AVX512F" "vscalef<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}" @@ -10558,8 +11209,8 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_getexp<mode><mask_name><round_saeonly_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL [(match_operand:VF_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")] + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL [(match_operand:VFH_AVX512VL 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")] UNSPEC_GETEXP))] "TARGET_AVX512F" "vgetexp<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"; @@ -10567,11 +11218,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512f_sgetexp<mode><mask_scalar_name><round_saeonly_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")] + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>")] UNSPEC_GETEXP) (match_dup 1) (const_int 1)))] @@ -10603,9 +11254,21 @@ (match_operand:V48_256_512_AVX512VL 1 "register_operand" "v") (parallel [(match_operand 2 "<vec_extract_imm_predicate>")])))] "TARGET_AVX512F - && INTVAL(operands[2]) >= 16 / GET_MODE_SIZE (<ssescalarmode>mode)" - "valign<ternlogsuffix>\t{%2, %1, %1, %<xtg_mode>0|%<xtg_mode>0, %1, %1, %2}"; - [(set_attr "prefix" "evex") + && INTVAL(operands[2]) * GET_MODE_SIZE (<ssescalarmode>mode) >= 16" +{ + int byte_offset = INTVAL (operands[2]) * GET_MODE_SIZE (<ssescalarmode>mode); + if (byte_offset % 16 == 0) + { + operands[2] = GEN_INT (byte_offset / 16); + if (byte_offset / 16 == 1) + return "vextract<shuffletype><extract_suf>\t{%2, %t1, %x0|%x0, %t1, %2}"; + else + return "vextract<shuffletype><extract_suf>\t{%2, %1, %x0|%x0, %1, %2}"; + } + else + return "valign<ternlogsuffix>\t{%2, %1, %1, %<xtg_mode>0|%<xtg_mode>0, %1, %1, %2}"; +} + [(set_attr "prefix" "maybe_evex") (set_attr "mode" "<sseintvecinsnmode>")]) (define_expand "avx512f_shufps512_mask" @@ -10737,9 +11400,9 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "<avx512>_rndscale<mode><mask_name><round_saeonly_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>") + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL + [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" 
"<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_ROUND))] "TARGET_AVX512F" @@ -10749,13 +11412,13 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512f_rndscale<mode><mask_scalar_name><round_saeonly_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_ROUND) - (match_operand:VF_128 1 "register_operand" "v") + (match_operand:VFH_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F" "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}" @@ -10764,14 +11427,14 @@ (set_attr "mode" "<MODE>")]) (define_insn "*avx512f_rndscale<mode><round_saeonly_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (vec_duplicate:VF_128 + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (vec_duplicate:VFH_128 (unspec:<ssescalarmode> [(match_operand:<ssescalarmode> 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_ROUND)) - (match_operand:VF_128 1 "register_operand" "v") + (match_operand:VFH_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F" "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}" @@ -15359,12 +16022,12 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_insn "avx512bw_interleave_highv32hi<mask_name>" - [(set (match_operand:V32HI 0 "register_operand" "=v") - (vec_select:V32HI - (vec_concat:V64HI - (match_operand:V32HI 1 "register_operand" "v") - (match_operand:V32HI 2 "nonimmediate_operand" "vm")) +(define_insn "avx512bw_interleave_high<mode><mask_name>" + [(set (match_operand:V32_512 0 "register_operand" "=v") + (vec_select:V32_512 + (vec_concat:<ssedoublevecmode> + (match_operand:V32_512 1 "register_operand" "v") + (match_operand:V32_512 2 "nonimmediate_operand" "vm")) (parallel [(const_int 4) (const_int 36) (const_int 5) (const_int 37) (const_int 6) (const_int 38) @@ -15387,12 +16050,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn "avx2_interleave_highv16hi<mask_name>" - [(set (match_operand:V16HI 0 "register_operand" "=Yw") - (vec_select:V16HI - (vec_concat:V32HI - (match_operand:V16HI 1 "register_operand" "Yw") - (match_operand:V16HI 2 "nonimmediate_operand" "Ywm")) +(define_insn "avx2_interleave_high<mode><mask_name>" + [(set (match_operand:V16_256 0 "register_operand" "=Yw") + (vec_select:V16_256 + (vec_concat:<ssedoublevecmode> + (match_operand:V16_256 1 "register_operand" "Yw") + (match_operand:V16_256 2 "nonimmediate_operand" "Ywm")) (parallel [(const_int 4) (const_int 20) (const_int 5) (const_int 21) (const_int 6) (const_int 22) @@ -15407,12 +16070,12 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) -(define_insn "vec_interleave_highv8hi<mask_name>" - [(set (match_operand:V8HI 0 "register_operand" "=x,Yw") - (vec_select:V8HI - (vec_concat:V16HI - (match_operand:V8HI 1 "register_operand" "0,Yw") - (match_operand:V8HI 2 "vector_operand" "xBm,Ywm")) +(define_insn 
"vec_interleave_high<mode><mask_name>" + [(set (match_operand:V8_128 0 "register_operand" "=x,Yw") + (vec_select:V8_128 + (vec_concat:<ssedoublevecmode> + (match_operand:V8_128 1 "register_operand" "0,Yw") + (match_operand:V8_128 2 "vector_operand" "xBm,Ywm")) (parallel [(const_int 4) (const_int 12) (const_int 5) (const_int 13) (const_int 6) (const_int 14) @@ -15427,12 +16090,12 @@ (set_attr "prefix" "orig,maybe_vex") (set_attr "mode" "TI")]) -(define_insn "<mask_codefor>avx512bw_interleave_lowv32hi<mask_name>" - [(set (match_operand:V32HI 0 "register_operand" "=v") - (vec_select:V32HI - (vec_concat:V64HI - (match_operand:V32HI 1 "register_operand" "v") - (match_operand:V32HI 2 "nonimmediate_operand" "vm")) +(define_insn "<mask_codefor>avx512bw_interleave_low<mode><mask_name>" + [(set (match_operand:V32_512 0 "register_operand" "=v") + (vec_select:V32_512 + (vec_concat:<ssedoublevecmode> + (match_operand:V32_512 1 "register_operand" "v") + (match_operand:V32_512 2 "nonimmediate_operand" "vm")) (parallel [(const_int 0) (const_int 32) (const_int 1) (const_int 33) (const_int 2) (const_int 34) @@ -15455,12 +16118,12 @@ (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -(define_insn "avx2_interleave_lowv16hi<mask_name>" - [(set (match_operand:V16HI 0 "register_operand" "=Yw") - (vec_select:V16HI - (vec_concat:V32HI - (match_operand:V16HI 1 "register_operand" "Yw") - (match_operand:V16HI 2 "nonimmediate_operand" "Ywm")) +(define_insn "avx2_interleave_low<mode><mask_name>" + [(set (match_operand:V16_256 0 "register_operand" "=Yw") + (vec_select:V16_256 + (vec_concat:<ssedoublevecmode> + (match_operand:V16_256 1 "register_operand" "Yw") + (match_operand:V16_256 2 "nonimmediate_operand" "Ywm")) (parallel [(const_int 0) (const_int 16) (const_int 1) (const_int 17) (const_int 2) (const_int 18) @@ -15475,12 +16138,12 @@ (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) -(define_insn "vec_interleave_lowv8hi<mask_name>" - [(set (match_operand:V8HI 0 "register_operand" "=x,Yw") - (vec_select:V8HI - (vec_concat:V16HI - (match_operand:V8HI 1 "register_operand" "0,Yw") - (match_operand:V8HI 2 "vector_operand" "xBm,Ywm")) +(define_insn "vec_interleave_low<mode><mask_name>" + [(set (match_operand:V8_128 0 "register_operand" "=x,Yw") + (vec_select:V8_128 + (vec_concat:<ssedoublevecmode> + (match_operand:V8_128 1 "register_operand" "0,Yw") + (match_operand:V8_128 2 "vector_operand" "xBm,Ywm")) (parallel [(const_int 0) (const_int 8) (const_int 1) (const_int 9) (const_int 2) (const_int 10) @@ -15655,6 +16318,7 @@ (V4SI "avx512dq") (V2DI "avx512dq")]) ;; sse4_1_pinsrd must come before sse2_loadld since it is preferred. +;; For V8HFmode and TARGET_AVX2, broadcastw + pblendw should be better. 
(define_insn "<sse2p4_1>_pinsr<ssemodesuffix>" [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v") (vec_merge:PINSR_MODE @@ -15664,7 +16328,8 @@ (match_operand:SI 3 "const_int_operand")))] "TARGET_SSE2 && ((unsigned) exact_log2 (INTVAL (operands[3])) - < GET_MODE_NUNITS (<MODE>mode))" + < GET_MODE_NUNITS (<MODE>mode)) + && !(<MODE>mode == V8HFmode && TARGET_AVX2)" { operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); @@ -15672,26 +16337,18 @@ { case 0: if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode)) - return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}"; + return "pinsr<sseintmodesuffix>\t{%3, %k2, %0|%0, %k2, %3}"; /* FALLTHRU */ case 1: - return "pinsr<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}"; + return "pinsr<sseintmodesuffix>\t{%3, %2, %0|%0, %2, %3}"; case 2: case 4: if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode)) - { - if (<MODE>mode == V8HFmode) - return "vpinsrw\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; - else - return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; - } + return "vpinsr<sseintmodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; /* FALLTHRU */ case 3: case 5: - if (<MODE>mode == V8HFmode) - return "vpinsrw\t{%3, %2, %1, %0|%0, %1, %2, %3}"; - else - return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + return "vpinsr<sseintmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"; default: gcc_unreachable (); } @@ -19179,11 +19836,14 @@ (lt:VI1_AVX2 (match_dup 3) (match_dup 4))] UNSPEC_BLENDV))] "operands[3] = gen_lowpart (<MODE>mode, operands[3]);") -(define_insn "sse4_1_pblendw" - [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x") - (vec_merge:V8HI - (match_operand:V8HI 2 "vector_operand" "YrBm,*xBm,xm") - (match_operand:V8HI 1 "register_operand" "0,0,x") +(define_mode_attr blendsuf + [(V8HI "w") (V8HF "ph")]) + +(define_insn "sse4_1_pblend<blendsuf>" + [(set (match_operand:V8_128 0 "register_operand" "=Yr,*x,x") + (vec_merge:V8_128 + (match_operand:V8_128 2 "vector_operand" "YrBm,*xBm,xm") + (match_operand:V8_128 1 "register_operand" "0,0,x") (match_operand:SI 3 "const_0_to_255_operand" "n,n,n")))] "TARGET_SSE4_1" "@ @@ -19210,6 +19870,47 @@ operands[3] = GEN_INT (val << 8 | val); }) +(define_expand "avx2_pblendph" + [(set (match_operand:V16HF 0 "register_operand") + (vec_merge:V16HF + (match_operand:V16HF 2 "register_operand") + (match_operand:V16HF 1 "register_operand") + (match_operand:SI 3 "const_int_operand")))] + "TARGET_AVX2 + && !((INTVAL (operands[3]) & 0xff) && (INTVAL (operands[3]) & 0xff00))" +{ + int mask = INTVAL (operands[3]); + if (mask == 0) + emit_move_insn (operands[0], operands[1]); + else + { + rtx tmp = gen_reg_rtx (V16HImode); + rtx blendw_idx, blendd_idx; + + if (mask & 0xff) + { + blendw_idx = GEN_INT (mask & 0xff); + blendd_idx = GEN_INT (15); + } + else + { + blendw_idx = GEN_INT (mask >> 8 & 0xff); + blendd_idx = GEN_INT (240); + } + operands[1] = lowpart_subreg (V16HImode, operands[1], V16HFmode); + operands[2] = lowpart_subreg (V16HImode, operands[2], V16HFmode); + emit_insn (gen_avx2_pblendw (tmp, operands[1], operands[2], blendw_idx)); + + operands[0] = lowpart_subreg (V8SImode, operands[0], V16HFmode); + tmp = lowpart_subreg (V8SImode, tmp, V16HImode); + operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode); + emit_insn (gen_avx2_pblenddv8si (operands[0], operands[1], + tmp, blendd_idx)); + } + + DONE; +}) + (define_insn "*avx2_pblendw" [(set (match_operand:V16HI 0 "register_operand" "=x") (vec_merge:V16HI @@ -24714,10 +25415,10 @@ (define_insn 
"avx512dq_fpclass<mode><mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> - [(match_operand:VF_AVX512VL 1 "vector_operand" "vm") + [(match_operand:VFH_AVX512VL 1 "vector_operand" "vm") (match_operand 2 "const_0_to_255_operand" "n")] UNSPEC_FPCLASS))] - "TARGET_AVX512DQ" + "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)" "vfpclass<ssemodesuffix><vecmemsuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; [(set_attr "type" "sse") (set_attr "length_immediate" "1") @@ -24728,11 +25429,11 @@ [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> (unspec:<avx512fmaskmode> - [(match_operand:VF_128 1 "nonimmediate_operand" "vm") + [(match_operand:VFH_128 1 "nonimmediate_operand" "vm") (match_operand 2 "const_0_to_255_operand" "n")] UNSPEC_FPCLASS) (const_int 1)))] - "TARGET_AVX512DQ" + "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)" "vfpclass<ssescalarmodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; [(set_attr "type" "sse") (set_attr "length_immediate" "1") @@ -24740,9 +25441,9 @@ (set_attr "mode" "<MODE>")]) (define_insn "<avx512>_getmant<mode><mask_name><round_saeonly_name>" - [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") - (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>") + [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") + (unspec:VFH_AVX512VL + [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "<round_saeonly_constraint>") (match_operand:SI 2 "const_0_to_15_operand")] UNSPEC_GETMANT))] "TARGET_AVX512F" @@ -24751,11 +25452,11 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512f_vgetmant<mode><mask_scalar_name><round_saeonly_scalar_name>" - [(set (match_operand:VF_128 0 "register_operand" "=v") - (vec_merge:VF_128 - (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") + [(set (match_operand:VFH_128 0 "register_operand" "=v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "v") + (match_operand:VFH_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") (match_operand:SI 3 "const_0_to_15_operand")] UNSPEC_GETMANT) (match_dup 1) diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 717561a..157d49f 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -153,6 +153,7 @@ (define_subst_attr "round_mask_op4" "round" "" "<round_mask_operand4>") (define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>") (define_subst_attr "round_constraint" "round" "vm" "v") +(define_subst_attr "round_qq2phsuff" "round" "<qq2phsuff>" "") (define_subst_attr "bcst_round_constraint" "round" "vmBr" "v") (define_subst_attr "round_constraint2" "round" "m" "v") (define_subst_attr "round_constraint3" "round" "rm" "r") diff --git a/gcc/config/i386/vxworks.h b/gcc/config/i386/vxworks.h index ebda7d9..0676cb4 100644 --- a/gcc/config/i386/vxworks.h +++ b/gcc/config/i386/vxworks.h @@ -73,37 +73,37 @@ along with GCC; see the file COPYING3. 
If not see VXWORKS_OS_CPP_BUILTINS (); \ if (TARGET_64BIT) \ VX_CPUDEF (X86_64); \ - else if (TARGET_PENTIUM4) \ + else if (TARGET_CPU_P (PENTIUM4)) \ { \ VX_CPUDEF (PENTIUM4); \ VX_CPUVDEF (PENTIUM4); \ } \ - else if (TARGET_CORE2) \ + else if (TARGET_CPU_P (CORE2)) \ VX_CPUDEF (CORE2); \ - else if (TARGET_NEHALEM) \ + else if (TARGET_CPU_P (NEHALEM)) \ VX_CPUDEF (NEHALEM); \ - else if (TARGET_SANDYBRIDGE) \ + else if (TARGET_CPU_P (SANDYBRIDGE)) \ VX_CPUDEF (SANDYBRIDGE); \ - else if (TARGET_HASWELL) \ + else if (TARGET_CPU_P (HASWELL)) \ VX_CPUDEF (HASWELL); \ - else if (TARGET_SILVERMONT) \ + else if (TARGET_CPU_P (SILVERMONT)) \ VX_CPUDEF (SILVERMONT); \ - else if (TARGET_SKYLAKE || TARGET_SKYLAKE_AVX512) \ + else if (TARGET_CPU_P (SKYLAKE) || TARGET_CPU_P (SKYLAKE_AVX512)) \ VX_CPUDEF (SKYLAKE); \ - else if (TARGET_GOLDMONT) \ + else if (TARGET_CPU_P (GOLDMONT)) \ VX_CPUDEF (GOLDMONT); \ else if (TARGET_VXWORKS7) \ VX_CPUDEF (PENTIUM4); \ - else if (TARGET_386) \ + else if (TARGET_CPU_P (I386)) \ VX_CPUDEF (I80386); \ - else if (TARGET_486) \ + else if (TARGET_CPU_P (I486)) \ VX_CPUDEF (I80486); \ - else if (TARGET_PENTIUM) \ + else if (TARGET_CPU_P (PENTIUM)) \ { \ VX_CPUDEF (PENTIUM); \ VX_CPUVDEF (PENTIUM); \ } \ - else if (TARGET_PENTIUMPRO) \ + else if (TARGET_CPU_P (PENTIUMPRO)) \ { \ VX_CPUDEF (PENTIUM2); \ VX_CPUVDEF (PENTIUMPRO); \ diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index ffe810f..93644be 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = { "16", /* Func alignment. */ }; +static stringop_algs tremont_memcpy[2] = { + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; +static stringop_algs tremont_memset[2] = { + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; +static const +struct processor_costs tremont_cost = { + { + /* Start of register allocator costs. integer->integer move cost is 2. */ + 6, /* cost for loading QImode using movzbl */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {6, 6, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 12}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {6, 6}, /* cost of storing MMX registers + in SImode and DImode */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ + {6, 6, 6, 10, 15}, /* cost of loading SSE registers + in 32,64,128,256 and 512-bit */ + {6, 6, 6, 10, 15}, /* cost of storing SSE registers + in 32,64,128,256 and 512-bit */ + 6, 6, /* SSE->integer and integer->SSE moves */ + 6, 6, /* mask->integer and integer->mask moves */ + {6, 6, 6}, /* cost of loading mask register + in QImode, HImode, SImode. */ + {6, 6, 6}, /* cost if storing mask register + in QImode, HImode, SImode. */ + 2, /* cost of moving mask register. */ + /* End of register allocator costs. 
*/ + }, + + COSTS_N_INSNS (1), /* cost of an add instruction */ + /* Setting cost to 2 makes our current implementation of synth_mult result in + use of unnecessary temporary registers causing regression on several + SPECfp benchmarks. */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (4)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (22), /* HI */ + COSTS_N_INSNS (30), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 17, /* CLEAR_RATIO */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {6, 6, 6}, /* cost of storing integer registers */ + {6, 6, 6, 10, 15}, /* cost of loading SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 10, 15}, /* cost of storing SSE register + in 32bit, 64bit, 128bit, 256bit and 512bit */ + {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ + {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ + 6, /* cost of moving SSE register to integer. */ + 18, 6, /* Gather load static, per_elt. */ + 18, 6, /* Gather store static, per_elt. */ + 32, /* size of l1 cache. */ + 512, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + /* Benchmarks shows large regressions on K8 sixtrack benchmark when this + value is increased to perhaps more appropriate value of 5. */ + 3, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (5), /* cost of FMUL instruction. */ + COSTS_N_INSNS (17), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ + + COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ + COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + COSTS_N_INSNS (4), /* cost of MULSS instruction. */ + COSTS_N_INSNS (5), /* cost of MULSD instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ + COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ + COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ + COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ + COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ + COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ + 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + tremont_memcpy, + tremont_memset, + COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ + COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ + "16:11:8", /* Loop alignment. */ + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. 
*/ +}; + static stringop_algs intel_memcpy[2] = { {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c index 2e5ee4e..56ada99 100644 --- a/gcc/config/i386/x86-tune-sched.c +++ b/gcc/config/i386/x86-tune-sched.c @@ -71,6 +71,7 @@ ix86_issue_rate (void) case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: + case PROCESSOR_TREMONT: case PROCESSOR_GENERIC: return 4; @@ -429,6 +430,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: + case PROCESSOR_TREMONT: case PROCESSOR_GENERIC: /* Stack engine allows to execute push&pop instructions in parall. */ if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 2f221b1..58e8ead 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -62,6 +62,21 @@ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", that can be partly masked by careful scheduling of moves. */ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 + | m_BDVER | m_ZNVER | m_TREMONT | m_GENERIC) + +/* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids + partial write to the destination in scalar SSE conversion from FP + to FP. */ +DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY, + "sse_partial_reg_fp_converts_dependency", + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 + | m_BDVER | m_ZNVER | m_GENERIC) + +/* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial + write to the destination in scalar SSE conversion from integer to FP. */ +DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY, + "sse_partial_reg_converts_dependency", + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 | m_BDVER | m_ZNVER | m_GENERIC) /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies @@ -136,7 +151,7 @@ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL - | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ATHLON_K8) + | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8) /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are considered on critical path. */ @@ -150,14 +165,15 @@ DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move", /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", - m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) + m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_TREMONT + | m_GENERIC) /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions. Some chips, like 486 and Pentium works faster with separate load and push instructions. */ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE - | m_GENERIC) + | m_TREMONT | m_GENERIC) /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred over esp subtraction. */ @@ -198,8 +214,7 @@ DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns", than 4 branch instructions in the 16 byte window. 
*/ DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM - | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL | m_ATHLON_K8 - | m_AMDFAM10) + | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10) /*****************************************************************************/ /* Integer instruction selection tuning */ @@ -240,11 +255,11 @@ DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves", /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL - | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL) + | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL) /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */ DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr", - m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT + m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL | m_KNM) /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is @@ -263,7 +278,7 @@ DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8", a conditional move. */ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove", m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL - | m_KNM | m_TREMONT | m_INTEL) + | m_KNM | m_INTEL) /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */ @@ -273,7 +288,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) move/set sequences of bytes with known size. */ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, "prefer_known_rep_movsb_stosb", - m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512) + m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512) /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of compact prologues and epilogues by issuing a misaligned moves. This @@ -282,7 +297,8 @@ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, FIXME: This may actualy be a win on more targets than listed here. */ DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES, "misaligned_move_string_pro_epilogues", - m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC) + m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_TREMONT + | m_GENERIC) /* X86_TUNE_USE_SAHF: Controls use of SAHF. */ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", @@ -294,7 +310,7 @@ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */ DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL - | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT)) + | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS)) /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", @@ -305,7 +321,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt", /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency for bit-manipulation instructions. */ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", - m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_GENERIC) /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based on hardware capabilities. 
Bdver3 hardware has a loop buffer which makes @@ -321,14 +337,14 @@ DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn", /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence. */ DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence", - m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC) + m_CORE_ALL | m_BDVER | m_ZNVER | m_TREMONT | m_GENERIC) /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) - (signed) x >> (W-1)) instead of cmove or SSE max/abs instructions. */ DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs", m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT ) + | m_GOLDMONT_PLUS) /*****************************************************************************/ /* 387 instruction selection tuning */ @@ -386,13 +402,13 @@ DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optim /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. */ DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", - m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC) + m_AMD_MULTIPLE | m_CORE_ALL | m_TREMONT | m_GENERIC) /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to xorps/xorpd and other variants. */ DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER - | m_GENERIC) + | m_TREMONT | m_GENERIC) /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer to SSE registers. If disabled, the moves will be done by storing @@ -419,7 +435,7 @@ DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions", fp converts to destination register. */ DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts", m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS - | m_TREMONT | m_INTEL) + | m_INTEL) /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion from FP to FP. This form of instructions avoids partial write to the @@ -434,7 +450,7 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) /* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. */ DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT - | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL) + | m_GOLDMONT_PLUS | m_INTEL) /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", diff --git a/gcc/config/mips/netbsd.h b/gcc/config/mips/netbsd.h index 85c2779..1c6a59d 100644 --- a/gcc/config/mips/netbsd.h +++ b/gcc/config/mips/netbsd.h @@ -87,7 +87,7 @@ along with GCC; see the file COPYING3. If not see else if (mips_isa >= MIPS_ISA_MIPS32 \ && mips_isa < MIPS_ISA_MIPS64) \ builtin_define ("__mips=32"); \ - else if (mips_isa >= MIPS_ISA_64) \ + else if (mips_isa >= MIPS_ISA_MIPS64) \ builtin_define ("__mips=64"); \ if (mips_isa_rev > 0) \ builtin_define_with_int_value ("__mips_isa_rev", \ diff --git a/gcc/config/rs6000/lynx.h b/gcc/config/rs6000/lynx.h index 3434c8b..0ddb54f 100644 --- a/gcc/config/rs6000/lynx.h +++ b/gcc/config/rs6000/lynx.h @@ -80,7 +80,6 @@ #undef SIZE_TYPE #undef ASM_OUTPUT_ALIGN -#undef PREFERRED_DEBUGGING_TYPE /* The file rs6000.c defines TARGET_HAVE_TLS unconditionally to the value of HAVE_AS_TLS. 
HAVE_AS_TLS is true as gas support for TLS diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index 1f6fc03..1990a21 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -91,7 +91,10 @@ UNSPEC_MMA_XVI8GER4SPP UNSPEC_MMA_XXMFACC UNSPEC_MMA_XXMTACC - UNSPEC_MMA_XXSETACCZ + ]) + +(define_c_enum "unspecv" + [UNSPECV_MMA_XXSETACCZ ]) ;; MMA instructions with 1 accumulator argument @@ -467,30 +470,16 @@ "<acc> %A0" [(set_attr "type" "mma")]) -;; We can't have integer constants in XOmode so we wrap this in an UNSPEC. - -(define_expand "mma_xxsetaccz" - [(set (match_operand:XO 0 "fpr_reg_operand") - (const_int 0))] - "TARGET_MMA" -{ - rtx xo0 = gen_rtx_UNSPEC (XOmode, gen_rtvec (1, const0_rtx), - UNSPEC_MMA_XXSETACCZ); - emit_insn (gen_rtx_SET (operands[0], xo0)); - DONE; -}) +;; We can't have integer constants in XOmode so we wrap this in an +;; UNSPEC_VOLATILE. -(define_insn_and_split "*mma_xxsetaccz" +(define_insn "mma_xxsetaccz" [(set (match_operand:XO 0 "fpr_reg_operand" "=d") - (unspec:XO [(match_operand 1 "const_0_to_1_operand" "O")] - UNSPEC_MMA_XXSETACCZ))] + (unspec_volatile:XO [(const_int 0)] + UNSPECV_MMA_XXSETACCZ))] "TARGET_MMA" "xxsetaccz %A0" - "&& reload_completed" - [(set (match_dup 0) (unspec:XO [(match_dup 1)] UNSPEC_MMA_XXSETACCZ))] - "" - [(set_attr "type" "mma") - (set_attr "length" "4")]) + [(set_attr "type" "mma")]) (define_insn "mma_<vv>" [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index 6a28d51..a8c6b9e 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -208,6 +208,12 @@ double __builtin_mffs (); MFFS rs6000_mffs {} +; Although the mffsl instruction is only available on POWER9 and later +; processors, this builtin automatically falls back to mffs on older +; platforms. Thus it appears here in the [always] stanza. + double __builtin_mffsl (); + MFFSL rs6000_mffsl {} + ; This thing really assumes long double == __ibm128, and I'm told it has ; been used as such within libgcc. Given that __builtin_pack_ibm128 ; exists for the same purpose, this should really not be used at all. @@ -2784,9 +2790,6 @@ signed long long __builtin_darn_raw (); DARN_RAW darn_raw {} - double __builtin_mffsl (); - MFFSL rs6000_mffsl {} - const signed int __builtin_dtstsfi_eq_dd (const int<6>, _Decimal64); TSTSFI_EQ_DD dfptstsfi_eq_dd {} diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index afcb5bb..d08bdfe 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -35,6 +35,9 @@ #include "langhooks.h" #include "c/c-tree.h" +#include "rs6000-builtins.h" + +static tree altivec_resolve_new_overloaded_builtin (location_t, tree, void *); /* Handle the machine specific pragma longcall. Its syntax is @@ -811,6 +814,32 @@ is_float128_p (tree t) && t == long_double_type_node)); } + +/* Return true iff ARGTYPE can be compatibly passed as PARMTYPE. 
*/ +static bool +rs6000_new_builtin_type_compatible (tree parmtype, tree argtype) +{ + if (parmtype == error_mark_node) + return false; + + if (INTEGRAL_TYPE_P (parmtype) && INTEGRAL_TYPE_P (argtype)) + return true; + + if (TARGET_IEEEQUAD && TARGET_LONG_DOUBLE_128 + && is_float128_p (parmtype) && is_float128_p (argtype)) + return true; + + if (POINTER_TYPE_P (parmtype) && POINTER_TYPE_P (argtype)) + { + parmtype = TREE_TYPE (parmtype); + argtype = TREE_TYPE (argtype); + if (TYPE_READONLY (argtype)) + parmtype = build_qualified_type (parmtype, TYPE_QUAL_CONST); + } + + return lang_hooks.types_compatible_p (parmtype, argtype); +} + static inline bool rs6000_builtin_type_compatible (tree t, int id) { @@ -927,6 +956,10 @@ tree altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, void *passed_arglist) { + if (new_builtins_are_live) + return altivec_resolve_new_overloaded_builtin (loc, fndecl, + passed_arglist); + vec<tree, va_gc> *arglist = static_cast<vec<tree, va_gc> *> (passed_arglist); unsigned int nargs = vec_safe_length (arglist); enum rs6000_builtins fcode @@ -1930,3 +1963,1048 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return error_mark_node; } } + +/* Build a tree for a function call to an Altivec non-overloaded builtin. + The overloaded builtin that matched the types and args is described + by DESC. The N arguments are given in ARGS, respectively. + + Actually the only thing it does is calling fold_convert on ARGS, with + a small exception for vec_{all,any}_{ge,le} predicates. */ + +static tree +altivec_build_new_resolved_builtin (tree *args, int n, tree fntype, + tree ret_type, + rs6000_gen_builtins bif_id, + rs6000_gen_builtins ovld_id) +{ + tree argtypes = TYPE_ARG_TYPES (fntype); + tree arg_type[MAX_OVLD_ARGS]; + tree fndecl = rs6000_builtin_decls_x[bif_id]; + + for (int i = 0; i < n; i++) + { + arg_type[i] = TREE_VALUE (argtypes); + argtypes = TREE_CHAIN (argtypes); + } + + /* The AltiVec overloading implementation is overall gross, but this + is particularly disgusting. The vec_{all,any}_{ge,le} builtins + are completely different for floating-point vs. integer vector + types, because the former has vcmpgefp, but the latter should use + vcmpgtXX. + + In practice, the second and third arguments are swapped, and the + condition (LT vs. EQ, which is recognizable by bit 1 of the first + argument) is reversed. Patch the arguments here before building + the resolved CALL_EXPR. */ + if (n == 3 + && ovld_id == RS6000_OVLD_VEC_CMPGE_P + && bif_id != RS6000_BIF_VCMPGEFP_P + && bif_id != RS6000_BIF_XVCMPGEDP_P) + { + std::swap (args[1], args[2]); + std::swap (arg_type[1], arg_type[2]); + + args[0] = fold_build2 (BIT_XOR_EXPR, TREE_TYPE (args[0]), args[0], + build_int_cst (NULL_TREE, 2)); + } + + for (int j = 0; j < n; j++) + args[j] = fully_fold_convert (arg_type[j], args[j]); + + /* If the number of arguments to an overloaded function increases, + we must expand this switch. 
*/ + gcc_assert (MAX_OVLD_ARGS <= 4); + + tree call; + switch (n) + { + case 0: + call = build_call_expr (fndecl, 0); + break; + case 1: + call = build_call_expr (fndecl, 1, args[0]); + break; + case 2: + call = build_call_expr (fndecl, 2, args[0], args[1]); + break; + case 3: + call = build_call_expr (fndecl, 3, args[0], args[1], args[2]); + break; + case 4: + call = build_call_expr (fndecl, 4, args[0], args[1], args[2], args[3]); + break; + default: + gcc_unreachable (); + } + return fold_convert (ret_type, call); +} + +/* Implementation of the resolve_overloaded_builtin target hook, to + support Altivec's overloaded builtins. FIXME: This code needs + to be brutally factored. */ + +static tree +altivec_resolve_new_overloaded_builtin (location_t loc, tree fndecl, + void *passed_arglist) +{ + vec<tree, va_gc> *arglist = static_cast<vec<tree, va_gc> *> (passed_arglist); + unsigned int nargs = vec_safe_length (arglist); + enum rs6000_gen_builtins fcode + = (enum rs6000_gen_builtins) DECL_MD_FUNCTION_CODE (fndecl); + tree fnargs = TYPE_ARG_TYPES (TREE_TYPE (fndecl)); + tree types[MAX_OVLD_ARGS]; + tree args[MAX_OVLD_ARGS]; + + /* Return immediately if this isn't an overload. */ + if (fcode <= RS6000_OVLD_NONE) + return NULL_TREE; + + unsigned int adj_fcode = fcode - RS6000_OVLD_NONE; + + if (TARGET_DEBUG_BUILTIN) + fprintf (stderr, "altivec_resolve_overloaded_builtin, code = %4d, %s\n", + (int) fcode, IDENTIFIER_POINTER (DECL_NAME (fndecl))); + + /* vec_lvsl and vec_lvsr are deprecated for use with LE element order. */ + if (fcode == RS6000_OVLD_VEC_LVSL && !BYTES_BIG_ENDIAN) + warning (OPT_Wdeprecated, + "%<vec_lvsl%> is deprecated for little endian; use " + "assignment for unaligned loads and stores"); + else if (fcode == RS6000_OVLD_VEC_LVSR && !BYTES_BIG_ENDIAN) + warning (OPT_Wdeprecated, + "%<vec_lvsr%> is deprecated for little endian; use " + "assignment for unaligned loads and stores"); + + if (fcode == RS6000_OVLD_VEC_MUL) + { + /* vec_mul needs to be special cased because there are no instructions + for it for the {un}signed char, {un}signed short, and {un}signed int + types. */ + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", "vec_mul"); + return error_mark_node; + } + + tree arg0 = (*arglist)[0]; + tree arg0_type = TREE_TYPE (arg0); + tree arg1 = (*arglist)[1]; + tree arg1_type = TREE_TYPE (arg1); + + /* Both arguments must be vectors and the types must be compatible. */ + if (TREE_CODE (arg0_type) != VECTOR_TYPE) + goto bad; + if (!lang_hooks.types_compatible_p (arg0_type, arg1_type)) + goto bad; + + switch (TYPE_MODE (TREE_TYPE (arg0_type))) + { + case E_QImode: + case E_HImode: + case E_SImode: + case E_DImode: + case E_TImode: + { + /* For scalar types just use a multiply expression. */ + return fold_build2_loc (loc, MULT_EXPR, TREE_TYPE (arg0), arg0, + fold_convert (TREE_TYPE (arg0), arg1)); + } + case E_SFmode: + { + /* For floats use the xvmulsp instruction directly. */ + tree call = rs6000_builtin_decls_x[RS6000_BIF_XVMULSP]; + return build_call_expr (call, 2, arg0, arg1); + } + case E_DFmode: + { + /* For doubles use the xvmuldp instruction directly. */ + tree call = rs6000_builtin_decls_x[RS6000_BIF_XVMULDP]; + return build_call_expr (call, 2, arg0, arg1); + } + /* Other types are errors. */ + default: + goto bad; + } + } + + if (fcode == RS6000_OVLD_VEC_CMPNE) + { + /* vec_cmpne needs to be special cased because there are no instructions + for it (prior to power 9). 
*/ + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", "vec_cmpne"); + return error_mark_node; + } + + tree arg0 = (*arglist)[0]; + tree arg0_type = TREE_TYPE (arg0); + tree arg1 = (*arglist)[1]; + tree arg1_type = TREE_TYPE (arg1); + + /* Both arguments must be vectors and the types must be compatible. */ + if (TREE_CODE (arg0_type) != VECTOR_TYPE) + goto bad; + if (!lang_hooks.types_compatible_p (arg0_type, arg1_type)) + goto bad; + + /* Power9 instructions provide the most efficient implementation of + ALTIVEC_BUILTIN_VEC_CMPNE if the mode is not DImode or TImode + or SFmode or DFmode. */ + if (!TARGET_P9_VECTOR + || (TYPE_MODE (TREE_TYPE (arg0_type)) == DImode) + || (TYPE_MODE (TREE_TYPE (arg0_type)) == TImode) + || (TYPE_MODE (TREE_TYPE (arg0_type)) == SFmode) + || (TYPE_MODE (TREE_TYPE (arg0_type)) == DFmode)) + { + switch (TYPE_MODE (TREE_TYPE (arg0_type))) + { + /* vec_cmpneq (va, vb) == vec_nor (vec_cmpeq (va, vb), + vec_cmpeq (va, vb)). */ + /* Note: vec_nand also works but opt changes vec_nand's + to vec_nor's anyway. */ + case E_QImode: + case E_HImode: + case E_SImode: + case E_DImode: + case E_TImode: + case E_SFmode: + case E_DFmode: + { + /* call = vec_cmpeq (va, vb) + result = vec_nor (call, call). */ + vec<tree, va_gc> *params = make_tree_vector (); + vec_safe_push (params, arg0); + vec_safe_push (params, arg1); + tree call = altivec_resolve_new_overloaded_builtin + (loc, rs6000_builtin_decls_x[RS6000_OVLD_VEC_CMPEQ], + params); + /* Use save_expr to ensure that operands used more than once + that may have side effects (like calls) are only evaluated + once. */ + call = save_expr (call); + params = make_tree_vector (); + vec_safe_push (params, call); + vec_safe_push (params, call); + return altivec_resolve_new_overloaded_builtin + (loc, rs6000_builtin_decls_x[RS6000_OVLD_VEC_NOR], params); + } + /* Other types are errors. */ + default: + goto bad; + } + } + /* else, fall through and process the Power9 alternative below */ + } + + if (fcode == RS6000_OVLD_VEC_ADDE || fcode == RS6000_OVLD_VEC_SUBE) + { + /* vec_adde needs to be special cased because there is no instruction + for the {un}signed int version. */ + if (nargs != 3) + { + const char *name; + name = fcode == RS6000_OVLD_VEC_ADDE ? "vec_adde" : "vec_sube"; + error ("builtin %qs only accepts 3 arguments", name); + return error_mark_node; + } + + tree arg0 = (*arglist)[0]; + tree arg0_type = TREE_TYPE (arg0); + tree arg1 = (*arglist)[1]; + tree arg1_type = TREE_TYPE (arg1); + tree arg2 = (*arglist)[2]; + tree arg2_type = TREE_TYPE (arg2); + + /* All 3 arguments must be vectors of (signed or unsigned) (int or + __int128) and the types must be compatible. */ + if (TREE_CODE (arg0_type) != VECTOR_TYPE) + goto bad; + if (!lang_hooks.types_compatible_p (arg0_type, arg1_type) + || !lang_hooks.types_compatible_p (arg1_type, arg2_type)) + goto bad; + + switch (TYPE_MODE (TREE_TYPE (arg0_type))) + { + /* For {un}signed ints, + vec_adde (va, vb, carryv) == vec_add (vec_add (va, vb), + vec_and (carryv, 1)). + vec_sube (va, vb, carryv) == vec_sub (vec_sub (va, vb), + vec_and (carryv, 1)). 
*/ + case E_SImode: + { + tree add_sub_builtin; + + vec<tree, va_gc> *params = make_tree_vector (); + vec_safe_push (params, arg0); + vec_safe_push (params, arg1); + + if (fcode == RS6000_OVLD_VEC_ADDE) + add_sub_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADD]; + else + add_sub_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUB]; + + tree call + = altivec_resolve_new_overloaded_builtin (loc, + add_sub_builtin, + params); + tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1); + tree ones_vector = build_vector_from_val (arg0_type, const1); + tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type, + arg2, ones_vector); + params = make_tree_vector (); + vec_safe_push (params, call); + vec_safe_push (params, and_expr); + return altivec_resolve_new_overloaded_builtin (loc, + add_sub_builtin, + params); + } + /* For {un}signed __int128s use the vaddeuqm/vsubeuqm instruction + directly. */ + case E_TImode: + break; + + /* Types other than {un}signed int and {un}signed __int128 + are errors. */ + default: + goto bad; + } + } + + if (fcode == RS6000_OVLD_VEC_ADDEC || fcode == RS6000_OVLD_VEC_SUBEC) + { + /* vec_addec and vec_subec needs to be special cased because there is + no instruction for the {un}signed int version. */ + if (nargs != 3) + { + const char *name; + name = fcode == RS6000_OVLD_VEC_ADDEC ? "vec_addec" : "vec_subec"; + error ("builtin %qs only accepts 3 arguments", name); + return error_mark_node; + } + + tree arg0 = (*arglist)[0]; + tree arg0_type = TREE_TYPE (arg0); + tree arg1 = (*arglist)[1]; + tree arg1_type = TREE_TYPE (arg1); + tree arg2 = (*arglist)[2]; + tree arg2_type = TREE_TYPE (arg2); + + /* All 3 arguments must be vectors of (signed or unsigned) (int or + __int128) and the types must be compatible. */ + if (TREE_CODE (arg0_type) != VECTOR_TYPE) + goto bad; + if (!lang_hooks.types_compatible_p (arg0_type, arg1_type) + || !lang_hooks.types_compatible_p (arg1_type, arg2_type)) + goto bad; + + switch (TYPE_MODE (TREE_TYPE (arg0_type))) + { + /* For {un}signed ints, + vec_addec (va, vb, carryv) == + vec_or (vec_addc (va, vb), + vec_addc (vec_add (va, vb), + vec_and (carryv, 0x1))). */ + case E_SImode: + { + /* Use save_expr to ensure that operands used more than once + that may have side effects (like calls) are only evaluated + once. 
*/ + tree as_builtin; + tree as_c_builtin; + + arg0 = save_expr (arg0); + arg1 = save_expr (arg1); + vec<tree, va_gc> *params = make_tree_vector (); + vec_safe_push (params, arg0); + vec_safe_push (params, arg1); + + if (fcode == RS6000_OVLD_VEC_ADDEC) + as_c_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADDC]; + else + as_c_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUBC]; + + tree call1 = altivec_resolve_new_overloaded_builtin (loc, + as_c_builtin, + params); + params = make_tree_vector (); + vec_safe_push (params, arg0); + vec_safe_push (params, arg1); + + if (fcode == RS6000_OVLD_VEC_ADDEC) + as_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_ADD]; + else + as_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_SUB]; + + tree call2 = altivec_resolve_new_overloaded_builtin (loc, + as_builtin, + params); + tree const1 = build_int_cstu (TREE_TYPE (arg0_type), 1); + tree ones_vector = build_vector_from_val (arg0_type, const1); + tree and_expr = fold_build2_loc (loc, BIT_AND_EXPR, arg0_type, + arg2, ones_vector); + params = make_tree_vector (); + vec_safe_push (params, call2); + vec_safe_push (params, and_expr); + call2 = altivec_resolve_new_overloaded_builtin (loc, as_c_builtin, + params); + params = make_tree_vector (); + vec_safe_push (params, call1); + vec_safe_push (params, call2); + tree or_builtin = rs6000_builtin_decls_x[RS6000_OVLD_VEC_OR]; + return altivec_resolve_new_overloaded_builtin (loc, or_builtin, + params); + } + /* For {un}signed __int128s use the vaddecuq/vsubbecuq + instructions. This occurs through normal processing. */ + case E_TImode: + break; + + /* Types other than {un}signed int and {un}signed __int128 + are errors. */ + default: + goto bad; + } + } + + /* For now treat vec_splats and vec_promote as the same. */ + if (fcode == RS6000_OVLD_VEC_SPLATS || fcode == RS6000_OVLD_VEC_PROMOTE) + { + tree type, arg; + int size; + int i; + bool unsigned_p; + vec<constructor_elt, va_gc> *vec; + const char *name; + name = fcode == RS6000_OVLD_VEC_SPLATS ? "vec_splats" : "vec_promote"; + + if (fcode == RS6000_OVLD_VEC_SPLATS && nargs != 1) + { + error ("builtin %qs only accepts 1 argument", name); + return error_mark_node; + } + if (fcode == RS6000_OVLD_VEC_PROMOTE && nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", name); + return error_mark_node; + } + /* Ignore promote's element argument. */ + if (fcode == RS6000_OVLD_VEC_PROMOTE + && !INTEGRAL_TYPE_P (TREE_TYPE ((*arglist)[1]))) + goto bad; + + arg = (*arglist)[0]; + type = TREE_TYPE (arg); + if (!SCALAR_FLOAT_TYPE_P (type) + && !INTEGRAL_TYPE_P (type)) + goto bad; + unsigned_p = TYPE_UNSIGNED (type); + switch (TYPE_MODE (type)) + { + case E_TImode: + type = unsigned_p ? unsigned_V1TI_type_node : V1TI_type_node; + size = 1; + break; + case E_DImode: + type = unsigned_p ? unsigned_V2DI_type_node : V2DI_type_node; + size = 2; + break; + case E_SImode: + type = unsigned_p ? unsigned_V4SI_type_node : V4SI_type_node; + size = 4; + break; + case E_HImode: + type = unsigned_p ? unsigned_V8HI_type_node : V8HI_type_node; + size = 8; + break; + case E_QImode: + type = unsigned_p ? 
unsigned_V16QI_type_node : V16QI_type_node; + size = 16; + break; + case E_SFmode: + type = V4SF_type_node; + size = 4; + break; + case E_DFmode: + type = V2DF_type_node; + size = 2; + break; + default: + goto bad; + } + arg = save_expr (fold_convert (TREE_TYPE (type), arg)); + vec_alloc (vec, size); + for (i = 0; i < size; i++) + { + constructor_elt elt = {NULL_TREE, arg}; + vec->quick_push (elt); + } + return build_constructor (type, vec); + } + + /* For now use pointer tricks to do the extraction, unless we are on VSX + extracting a double from a constant offset. */ + if (fcode == RS6000_OVLD_VEC_EXTRACT) + { + tree arg1; + tree arg1_type; + tree arg2; + tree arg1_inner_type; + tree decl, stmt; + tree innerptrtype; + machine_mode mode; + + /* No second argument. */ + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", "vec_extract"); + return error_mark_node; + } + + arg2 = (*arglist)[1]; + arg1 = (*arglist)[0]; + arg1_type = TREE_TYPE (arg1); + + if (TREE_CODE (arg1_type) != VECTOR_TYPE) + goto bad; + if (!INTEGRAL_TYPE_P (TREE_TYPE (arg2))) + goto bad; + + /* See if we can optimize vec_extracts with the current VSX instruction + set. */ + mode = TYPE_MODE (arg1_type); + if (VECTOR_MEM_VSX_P (mode)) + + { + tree call = NULL_TREE; + int nunits = GET_MODE_NUNITS (mode); + + arg2 = fold_for_warn (arg2); + + /* If the second argument is an integer constant, generate + the built-in code if we can. We need 64-bit and direct + move to extract the small integer vectors. */ + if (TREE_CODE (arg2) == INTEGER_CST) + { + wide_int selector = wi::to_wide (arg2); + selector = wi::umod_trunc (selector, nunits); + arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector); + switch (mode) + { + default: + break; + + case E_V1TImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V1TI]; + break; + + case E_V2DFmode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DF]; + break; + + case E_V2DImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DI]; + break; + + case E_V4SFmode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SF]; + break; + + case E_V4SImode: + if (TARGET_DIRECT_MOVE_64BIT) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SI]; + break; + + case E_V8HImode: + if (TARGET_DIRECT_MOVE_64BIT) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V8HI]; + break; + + case E_V16QImode: + if (TARGET_DIRECT_MOVE_64BIT) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V16QI]; + break; + } + } + + /* If the second argument is variable, we can optimize it if we are + generating 64-bit code on a machine with direct move. */ + else if (TREE_CODE (arg2) != INTEGER_CST && TARGET_DIRECT_MOVE_64BIT) + { + switch (mode) + { + default: + break; + + case E_V2DFmode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DF]; + break; + + case E_V2DImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V2DI]; + break; + + case E_V4SFmode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SF]; + break; + + case E_V4SImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V4SI]; + break; + + case E_V8HImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V8HI]; + break; + + case E_V16QImode: + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_EXT_V16QI]; + break; + } + } + + if (call) + { + tree result = build_call_expr (call, 2, arg1, arg2); + /* Coerce the result to vector element type. May be no-op. 
*/ + arg1_inner_type = TREE_TYPE (arg1_type); + result = fold_convert (arg1_inner_type, result); + return result; + } + } + + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2). */ + arg1_inner_type = TREE_TYPE (arg1_type); + tree subp = build_int_cst (TREE_TYPE (arg2), + TYPE_VECTOR_SUBPARTS (arg1_type) - 1); + arg2 = build_binary_op (loc, BIT_AND_EXPR, arg2, subp, 0); + decl = build_decl (loc, VAR_DECL, NULL_TREE, arg1_type); + DECL_EXTERNAL (decl) = 0; + TREE_PUBLIC (decl) = 0; + DECL_CONTEXT (decl) = current_function_decl; + TREE_USED (decl) = 1; + TREE_TYPE (decl) = arg1_type; + TREE_READONLY (decl) = TYPE_READONLY (arg1_type); + if (c_dialect_cxx ()) + { + stmt = build4 (TARGET_EXPR, arg1_type, decl, arg1, + NULL_TREE, NULL_TREE); + SET_EXPR_LOCATION (stmt, loc); + } + else + { + DECL_INITIAL (decl) = arg1; + stmt = build1 (DECL_EXPR, arg1_type, decl); + TREE_ADDRESSABLE (decl) = 1; + SET_EXPR_LOCATION (stmt, loc); + stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); + } + + innerptrtype = build_pointer_type (arg1_inner_type); + + stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); + stmt = convert (innerptrtype, stmt); + stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); + stmt = build_indirect_ref (loc, stmt, RO_NULL); + + /* PR83660: We mark this as having side effects so that + downstream in fold_build_cleanup_point_expr () it will get a + CLEANUP_POINT_EXPR. If it does not we can run into an ICE + later in gimplify_cleanup_point_expr (). Potentially this + causes missed optimization because there actually is no side + effect. */ + if (c_dialect_cxx ()) + TREE_SIDE_EFFECTS (stmt) = 1; + + return stmt; + } + + /* For now use pointer tricks to do the insertion, unless we are on VSX + inserting a double to a constant offset. */ + if (fcode == RS6000_OVLD_VEC_INSERT) + { + tree arg0; + tree arg1; + tree arg2; + tree arg1_type; + tree decl, stmt; + machine_mode mode; + + /* No second or third arguments. */ + if (nargs != 3) + { + error ("builtin %qs only accepts 3 arguments", "vec_insert"); + return error_mark_node; + } + + arg0 = (*arglist)[0]; + arg1 = (*arglist)[1]; + arg1_type = TREE_TYPE (arg1); + arg2 = fold_for_warn ((*arglist)[2]); + + if (TREE_CODE (arg1_type) != VECTOR_TYPE) + goto bad; + if (!INTEGRAL_TYPE_P (TREE_TYPE (arg2))) + goto bad; + + /* If we can use the VSX xxpermdi instruction, use that for insert. */ + mode = TYPE_MODE (arg1_type); + if ((mode == V2DFmode || mode == V2DImode) && VECTOR_UNIT_VSX_P (mode) + && TREE_CODE (arg2) == INTEGER_CST) + { + wide_int selector = wi::to_wide (arg2); + selector = wi::umod_trunc (selector, 2); + tree call = NULL_TREE; + + arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector); + if (mode == V2DFmode) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V2DF]; + else if (mode == V2DImode) + call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V2DI]; + + /* Note, __builtin_vec_insert_<xxx> has vector and scalar types + reversed. */ + if (call) + return build_call_expr (call, 3, arg1, arg0, arg2); + } + else if (mode == V1TImode && VECTOR_UNIT_VSX_P (mode) + && TREE_CODE (arg2) == INTEGER_CST) + { + tree call = rs6000_builtin_decls_x[RS6000_BIF_VEC_SET_V1TI]; + wide_int selector = wi::zero(32); + + arg2 = wide_int_to_tree (TREE_TYPE (arg2), selector); + /* Note, __builtin_vec_insert_<xxx> has vector and scalar types + reversed. */ + return build_call_expr (call, 3, arg1, arg0, arg2); + } + + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with + VIEW_CONVERT_EXPR. 
i.e.: + D.3192 = v1; + _1 = n & 3; + VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i; + v1 = D.3192; + D.3194 = v1; */ + if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) + arg2 = build_int_cst (TREE_TYPE (arg2), 0); + else + arg2 = build_binary_op (loc, BIT_AND_EXPR, arg2, + build_int_cst (TREE_TYPE (arg2), + TYPE_VECTOR_SUBPARTS (arg1_type) + - 1), 0); + decl = build_decl (loc, VAR_DECL, NULL_TREE, arg1_type); + DECL_EXTERNAL (decl) = 0; + TREE_PUBLIC (decl) = 0; + DECL_CONTEXT (decl) = current_function_decl; + TREE_USED (decl) = 1; + TREE_TYPE (decl) = arg1_type; + TREE_READONLY (decl) = TYPE_READONLY (arg1_type); + TREE_ADDRESSABLE (decl) = 1; + if (c_dialect_cxx ()) + { + stmt = build4 (TARGET_EXPR, arg1_type, decl, arg1, + NULL_TREE, NULL_TREE); + SET_EXPR_LOCATION (stmt, loc); + } + else + { + DECL_INITIAL (decl) = arg1; + stmt = build1 (DECL_EXPR, arg1_type, decl); + SET_EXPR_LOCATION (stmt, loc); + stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); + } + + if (TARGET_VSX) + { + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, + convert (TREE_TYPE (stmt), arg0)); + stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + } + else + { + tree arg1_inner_type; + tree innerptrtype; + arg1_inner_type = TREE_TYPE (arg1_type); + innerptrtype = build_pointer_type (arg1_inner_type); + + stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); + stmt = convert (innerptrtype, stmt); + stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); + stmt = build_indirect_ref (loc, stmt, RO_NULL); + stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, + convert (TREE_TYPE (stmt), arg0)); + stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); + } + return stmt; + } + + unsigned int n; + for (n = 0; + !VOID_TYPE_P (TREE_VALUE (fnargs)) && n < nargs; + fnargs = TREE_CHAIN (fnargs), n++) + { + tree decl_type = TREE_VALUE (fnargs); + tree arg = (*arglist)[n]; + tree type; + + if (arg == error_mark_node) + return error_mark_node; + + if (n >= MAX_OVLD_ARGS) + abort (); + + arg = default_conversion (arg); + + /* The C++ front-end converts float * to const void * using + NOP_EXPR<const void *> (NOP_EXPR<void *> (x)). */ + type = TREE_TYPE (arg); + if (POINTER_TYPE_P (type) + && TREE_CODE (arg) == NOP_EXPR + && lang_hooks.types_compatible_p (TREE_TYPE (arg), + const_ptr_type_node) + && lang_hooks.types_compatible_p (TREE_TYPE (TREE_OPERAND (arg, 0)), + ptr_type_node)) + { + arg = TREE_OPERAND (arg, 0); + type = TREE_TYPE (arg); + } + + /* Remove the const from the pointers to simplify the overload + matching further down. */ + if (POINTER_TYPE_P (decl_type) + && POINTER_TYPE_P (type) + && TYPE_QUALS (TREE_TYPE (type)) != 0) + { + if (TYPE_READONLY (TREE_TYPE (type)) + && !TYPE_READONLY (TREE_TYPE (decl_type))) + warning (0, "passing argument %d of %qE discards const qualifier " + "from pointer target type", n + 1, fndecl); + type = build_qualified_type (TREE_TYPE (type), 0); + type = build_pointer_type (type); + arg = fold_convert (type, arg); + } + + /* For RS6000_OVLD_VEC_LXVL, convert any const * to its non constant + equivalent to simplify the overload matching below. 
*/ + if (fcode == RS6000_OVLD_VEC_LXVL) + { + if (POINTER_TYPE_P (type) + && TYPE_READONLY (TREE_TYPE (type))) + { + type = build_qualified_type (TREE_TYPE (type), 0); + type = build_pointer_type (type); + arg = fold_convert (type, arg); + } + } + + args[n] = arg; + types[n] = type; + } + + /* If the number of arguments did not match the prototype, return NULL + and the generic code will issue the appropriate error message. */ + if (!VOID_TYPE_P (TREE_VALUE (fnargs)) || n < nargs) + return NULL; + + if (fcode == RS6000_OVLD_VEC_STEP) + { + if (TREE_CODE (types[0]) != VECTOR_TYPE) + goto bad; + + return build_int_cst (NULL_TREE, TYPE_VECTOR_SUBPARTS (types[0])); + } + + { + bool unsupported_builtin = false; + enum rs6000_gen_builtins overloaded_code; + bool supported = false; + ovlddata *instance = rs6000_overload_info[adj_fcode].first_instance; + gcc_assert (instance != NULL); + + /* Need to special case __builtin_cmpb because the overloaded forms + of this function take (unsigned int, unsigned int) or (unsigned + long long int, unsigned long long int). Since C conventions + allow the respective argument types to be implicitly coerced into + each other, the default handling does not provide adequate + discrimination between the desired forms of the function. */ + if (fcode == RS6000_OVLD_SCAL_CMPB) + { + machine_mode arg1_mode = TYPE_MODE (types[0]); + machine_mode arg2_mode = TYPE_MODE (types[1]); + + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", "__builtin_cmpb"); + return error_mark_node; + } + + /* If any supplied arguments are wider than 32 bits, resolve to + 64-bit variant of built-in function. */ + if (GET_MODE_PRECISION (arg1_mode) > 32 + || GET_MODE_PRECISION (arg2_mode) > 32) + /* Assure all argument and result types are compatible with + the built-in function represented by RS6000_BIF_CMPB. */ + overloaded_code = RS6000_BIF_CMPB; + else + /* Assure all argument and result types are compatible with + the built-in function represented by RS6000_BIF_CMPB_32. */ + overloaded_code = RS6000_BIF_CMPB_32; + + while (instance && instance->bifid != overloaded_code) + instance = instance->next; + + gcc_assert (instance != NULL); + tree fntype = rs6000_builtin_info_x[instance->bifid].fntype; + tree parmtype0 = TREE_VALUE (TYPE_ARG_TYPES (fntype)); + tree parmtype1 = TREE_VALUE (TREE_CHAIN (TYPE_ARG_TYPES (fntype))); + + if (rs6000_new_builtin_type_compatible (types[0], parmtype0) + && rs6000_new_builtin_type_compatible (types[1], parmtype1)) + { + if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node + && rs6000_new_builtin_is_supported (instance->bifid)) + { + tree ret_type = TREE_TYPE (instance->fntype); + return altivec_build_new_resolved_builtin (args, n, fntype, + ret_type, + instance->bifid, + fcode); + } + else + unsupported_builtin = true; + } + } + else if (fcode == RS6000_OVLD_VEC_VSIE) + { + machine_mode arg1_mode = TYPE_MODE (types[0]); + + if (nargs != 2) + { + error ("builtin %qs only accepts 2 arguments", + "scalar_insert_exp"); + return error_mark_node; + } + + /* If supplied first argument is wider than 64 bits, resolve to + 128-bit variant of built-in function. */ + if (GET_MODE_PRECISION (arg1_mode) > 64) + { + /* If first argument is of float variety, choose variant + that expects __ieee128 argument. Otherwise, expect + __int128 argument. 
*/ + if (GET_MODE_CLASS (arg1_mode) == MODE_FLOAT) + overloaded_code = RS6000_BIF_VSIEQPF; + else + overloaded_code = RS6000_BIF_VSIEQP; + } + else + { + /* If first argument is of float variety, choose variant + that expects double argument. Otherwise, expect + long long int argument. */ + if (GET_MODE_CLASS (arg1_mode) == MODE_FLOAT) + overloaded_code = RS6000_BIF_VSIEDPF; + else + overloaded_code = RS6000_BIF_VSIEDP; + } + + while (instance && instance->bifid != overloaded_code) + instance = instance->next; + + gcc_assert (instance != NULL); + tree fntype = rs6000_builtin_info_x[instance->bifid].fntype; + tree parmtype0 = TREE_VALUE (TYPE_ARG_TYPES (fntype)); + tree parmtype1 = TREE_VALUE (TREE_CHAIN (TYPE_ARG_TYPES (fntype))); + + if (rs6000_new_builtin_type_compatible (types[0], parmtype0) + && rs6000_new_builtin_type_compatible (types[1], parmtype1)) + { + if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node + && rs6000_new_builtin_is_supported (instance->bifid)) + { + tree ret_type = TREE_TYPE (instance->fntype); + return altivec_build_new_resolved_builtin (args, n, fntype, + ret_type, + instance->bifid, + fcode); + } + else + unsupported_builtin = true; + } + } + else + { + /* Functions with no arguments can have only one overloaded + instance. */ + gcc_assert (n > 0 || !instance->next); + + for (; instance != NULL; instance = instance->next) + { + bool mismatch = false; + tree nextparm = TYPE_ARG_TYPES (instance->fntype); + + for (unsigned int arg_i = 0; + arg_i < nargs && nextparm != NULL; + arg_i++) + { + tree parmtype = TREE_VALUE (nextparm); + if (!rs6000_new_builtin_type_compatible (types[arg_i], + parmtype)) + { + mismatch = true; + break; + } + nextparm = TREE_CHAIN (nextparm); + } + + if (mismatch) + continue; + + supported = rs6000_new_builtin_is_supported (instance->bifid); + if (rs6000_builtin_decl (instance->bifid, false) != error_mark_node + && supported) + { + tree fntype = rs6000_builtin_info_x[instance->bifid].fntype; + tree ret_type = TREE_TYPE (instance->fntype); + return altivec_build_new_resolved_builtin (args, n, fntype, + ret_type, + instance->bifid, + fcode); + } + else + { + unsupported_builtin = true; + break; + } + } + } + + if (unsupported_builtin) + { + const char *name = rs6000_overload_info[adj_fcode].ovld_name; + if (!supported) + { + const char *internal_name + = rs6000_builtin_info_x[instance->bifid].bifname; + /* An error message making reference to the name of the + non-overloaded function has already been issued. Add + clarification of the previous message. */ + rich_location richloc (line_table, input_location); + inform (&richloc, "builtin %qs requires builtin %qs", + name, internal_name); + } + else + error ("%qs is not supported in this compiler configuration", name); + + return error_mark_node; + } + } + bad: + { + const char *name = rs6000_overload_info[adj_fcode].ovld_name; + error ("invalid parameter combination for AltiVec intrinsic %qs", name); + return error_mark_node; + } +} diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index e8625d1..a55cb7c 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -12971,6 +12971,59 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator *gsi) return false; } +/* Check whether a builtin function is supported in this target + configuration. 
*/ +bool +rs6000_new_builtin_is_supported (enum rs6000_gen_builtins fncode) +{ + switch (rs6000_builtin_info_x[(size_t) fncode].enable) + { + case ENB_ALWAYS: + return true; + case ENB_P5: + return TARGET_POPCNTB; + case ENB_P6: + return TARGET_CMPB; + case ENB_P7: + return TARGET_POPCNTD; + case ENB_P7_64: + return TARGET_POPCNTD && TARGET_POWERPC64; + case ENB_P8: + return TARGET_DIRECT_MOVE; + case ENB_P8V: + return TARGET_P8_VECTOR; + case ENB_P9: + return TARGET_MODULO; + case ENB_P9_64: + return TARGET_MODULO && TARGET_POWERPC64; + case ENB_P9V: + return TARGET_P9_VECTOR; + case ENB_P10: + return TARGET_POWER10; + case ENB_P10_64: + return TARGET_POWER10 && TARGET_POWERPC64; + case ENB_ALTIVEC: + return TARGET_ALTIVEC; + case ENB_VSX: + return TARGET_VSX; + case ENB_CELL: + return TARGET_ALTIVEC && rs6000_cpu == PROCESSOR_CELL; + case ENB_IEEE128_HW: + return TARGET_FLOAT128_HW; + case ENB_DFP: + return TARGET_DFP; + case ENB_CRYPTO: + return TARGET_CRYPTO; + case ENB_HTM: + return TARGET_HTM; + case ENB_MMA: + return TARGET_MMA; + default: + gcc_unreachable (); + } + gcc_unreachable (); +} + /* Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient (and in mode MODE if that's convenient). diff --git a/gcc/config/rs6000/rs6000-gen-builtins.c b/gcc/config/rs6000/rs6000-gen-builtins.c index f3d6156..f65932e 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.c +++ b/gcc/config/rs6000/rs6000-gen-builtins.c @@ -2314,7 +2314,7 @@ write_decls (void) fprintf (header_file, "extern void rs6000_init_generated_builtins ();\n\n"); fprintf (header_file, - "extern bool rs6000_new_builtin_is_supported_p " + "extern bool rs6000_new_builtin_is_supported " "(rs6000_gen_builtins);\n"); fprintf (header_file, "extern tree rs6000_builtin_decl (unsigned, " diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 2570937..ad81dfb 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -21728,7 +21728,8 @@ rs6000_xcoff_encode_section_info (tree decl, rtx rtl, int first) if (decl && DECL_P (decl) && VAR_OR_FUNCTION_DECL_P (decl) - && symtab_node::get (decl)->alias == 0 + && (symtab_node::get (decl) == NULL + || symtab_node::get (decl)->alias == 0) && symname[strlen (symname) - 1] != ']') { const char *smclass = NULL; @@ -22174,7 +22175,7 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code, break; case UNSPEC: - if (XINT (x, 1) == UNSPEC_MMA_XXSETACCZ) + if (XINT (x, 1) == UNSPECV_MMA_XXSETACCZ) { *total = 0; return true; diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 3753de1..c1cb9ab 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -561,10 +561,6 @@ mpower9-minmax Target Undocumented Mask(P9_MINMAX) Var(rs6000_isa_flags) Use the new min/max instructions defined in ISA 3.0. -mtoc-fusion -Target Undocumented Mask(TOC_FUSION) Var(rs6000_isa_flags) -Fuse medium/large code model toc references with the memory instruction. - mmodulo Target Undocumented Mask(MODULO) Var(rs6000_isa_flags) Generate the integer modulo instructions. 
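The vec_cmpne path in altivec_resolve_new_overloaded_builtin above falls back, when no vcmpne* instruction is available (pre-Power9, or the DImode/TImode/SFmode/DFmode element modes listed), to vec_nor of vec_cmpeq with itself. A minimal C sketch of that equivalence, not part of this patch, assuming <altivec.h> on a VSX-enabled PowerPC target (the function name is hypothetical):

/* Illustration only: the folding quoted in the comment,
   vec_cmpne (a, b) == vec_nor (vec_cmpeq (a, b), vec_cmpeq (a, b)).
   The resolver wraps the vec_cmpeq result in save_expr so the comparison
   is evaluated only once.  */
#include <altivec.h>

vector bool int
cmpne_by_nor (vector signed int a, vector signed int b)
{
  vector bool int eq = vec_cmpeq (a, b);
  return vec_nor (eq, eq);
}

vec_adde/vec_sube and vec_addec/vec_subec are folded in the same spirit, combining vec_add/vec_sub, vec_addc/vec_subc and a vec_and of the carry vector with 1, exactly as spelled out in the comments above.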
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 92766d8..d48a4b1 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -44,15 +44,11 @@ rs6000-logue.o: $(srcdir)/config/rs6000/rs6000-logue.c $(COMPILE) $< $(POSTCOMPILE) -rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.c - $(COMPILE) $< - $(POSTCOMPILE) - -rbtree.o: $(srcdir)/config/rs6000/rbtree.c - $(COMPILE) $< - $(POSTCOMPILE) +build/rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.c +build/rbtree.o: $(srcdir)/config/rs6000/rbtree.c -rs6000-gen-builtins: rs6000-gen-builtins.o rbtree.o +build/rs6000-gen-builtins$(build_exeext): build/rs6000-gen-builtins.o \ + build/rbtree.o $(BUILD_LIBDEPS) $(LINKER_FOR_BUILD) $(BUILD_LINKERFLAGS) $(BUILD_LDFLAGS) -o $@ \ $(filter-out $(BUILD_LIBDEPS), $^) $(BUILD_LIBS) @@ -62,10 +58,11 @@ rs6000-gen-builtins: rs6000-gen-builtins.o rbtree.o # <recipe> # For now, the header files depend on rs6000-builtins.c, which avoids # races because the .c file is closed last in rs6000-gen-builtins.c. -rs6000-builtins.c: rs6000-gen-builtins \ +rs6000-builtins.c: build/rs6000-gen-builtins$(build_exeext) \ $(srcdir)/config/rs6000/rs6000-builtin-new.def \ $(srcdir)/config/rs6000/rs6000-overload.def - ./rs6000-gen-builtins $(srcdir)/config/rs6000/rs6000-builtin-new.def \ + $(RUN_GEN) ./build/rs6000-gen-builtins$(build_exeext) \ + $(srcdir)/config/rs6000/rs6000-builtin-new.def \ $(srcdir)/config/rs6000/rs6000-overload.def rs6000-builtins.h \ rs6000-builtins.c rs6000-vecdefines.h diff --git a/gcc/config/sparc/leon5.md b/gcc/config/sparc/leon5.md new file mode 100644 index 0000000..6a065b1 --- /dev/null +++ b/gcc/config/sparc/leon5.md @@ -0,0 +1,103 @@ +;; Scheduling description for LEON5. +;; Copyright (C) 2021 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + + +;; The LEON5 can often dual issue instructions from the same 64-bit aligned +;; double word if there are no data dependencies. +;; +;; Avoid scheduling load/store, FPU, and multiply instructions back to +;; back, regardless of data dependencies. +;; +;; Push comparisons away from the associated branch instruction. +;; +;; Avoid scheduling ALU instructions with data dependencies back to back. +;; +;; Schedule three instructions between load and dependent instruction. 
+ +(define_automaton "leon5") + +(define_cpu_unit "leon5_memory" "leon5") +(define_cpu_unit "leon5_mul" "leon5") +(define_cpu_unit "grfpu_d" "grfpu") +(define_cpu_unit "grfpu_s" "grfpu") + +(define_insn_reservation "leon5_load" 4 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "load,sload")) + "leon5_memory * 2, nothing * 2") + +(define_insn_reservation "leon5_fpload" 2 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpload")) + "leon5_memory * 2 + grfpu_alu * 2") + +(define_insn_reservation "leon5_store" 2 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "store")) + "leon5_memory * 2") + +(define_insn_reservation "leon5_fpstore" 2 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpstore")) + "leon5_memory * 2 + grfpu_alu * 2") + +(define_insn_reservation "leon5_ialu" 2 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "ialu, shift, ialuX")) + "nothing * 2") + +(define_insn_reservation "leon5_compare" 5 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "compare")) + "nothing * 5") + +(define_insn_reservation "leon5_imul" 4 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "imul")) + "leon5_mul * 2, nothing * 2") + +(define_insn_reservation "leon5_idiv" 35 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "imul")) + "nothing * 35") + +(define_insn_reservation "leon5_fp_alu" 5 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fp,fpcmp,fpmul,fpmove")) + "grfpu_alu * 2, nothing*3") + +(define_insn_reservation "leon5_fp_divs" 17 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpdivs")) + "grfpu_alu * 2 + grfpu_d*16, nothing") + +(define_insn_reservation "leon5_fp_divd" 18 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpdivd")) + "grfpu_alu * 2 + grfpu_d*17, nothing") + +(define_insn_reservation "leon5_fp_sqrts" 25 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpsqrts")) + "grfpu_alu * 2 + grfpu_s*24, nothing") + +(define_insn_reservation "leon5_fp_sqrtd" 26 + (and (eq_attr "cpu" "leon5") + (eq_attr "type" "fpsqrtd")) + "grfpu_alu * 2 + grfpu_s*25, nothing") diff --git a/gcc/config/sparc/sparc-opts.h b/gcc/config/sparc/sparc-opts.h index 1af556e..9299cf6 100644 --- a/gcc/config/sparc/sparc-opts.h +++ b/gcc/config/sparc/sparc-opts.h @@ -31,6 +31,7 @@ enum sparc_processor_type { PROCESSOR_HYPERSPARC, PROCESSOR_LEON, PROCESSOR_LEON3, + PROCESSOR_LEON5, PROCESSOR_LEON3V7, PROCESSOR_SPARCLITE, PROCESSOR_F930, diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index 06f41d7..6bc6f0a 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -270,6 +270,31 @@ struct processor_costs leon3_costs = { }; static const +struct processor_costs leon5_costs = { + COSTS_N_INSNS (1), /* int load */ + COSTS_N_INSNS (1), /* int signed load */ + COSTS_N_INSNS (1), /* int zeroed load */ + COSTS_N_INSNS (1), /* float load */ + COSTS_N_INSNS (1), /* fmov, fneg, fabs */ + COSTS_N_INSNS (1), /* fadd, fsub */ + COSTS_N_INSNS (1), /* fcmp */ + COSTS_N_INSNS (1), /* fmov, fmovr */ + COSTS_N_INSNS (1), /* fmul */ + COSTS_N_INSNS (17), /* fdivs */ + COSTS_N_INSNS (18), /* fdivd */ + COSTS_N_INSNS (25), /* fsqrts */ + COSTS_N_INSNS (26), /* fsqrtd */ + COSTS_N_INSNS (4), /* imul */ + COSTS_N_INSNS (4), /* imulX */ + 0, /* imul bit factor */ + COSTS_N_INSNS (35), /* idiv */ + COSTS_N_INSNS (35), /* idivX */ + COSTS_N_INSNS (1), /* movcc/movr */ + 0, /* shift penalty */ + 3 /* branch cost */ +}; + +static const struct processor_costs sparclet_costs = { COSTS_N_INSNS (3), /* int load */ COSTS_N_INSNS (3), /* int signed load */ @@ -575,6 +600,7 @@ static int function_arg_slotno (const 
CUMULATIVE_ARGS *, machine_mode, static int supersparc_adjust_cost (rtx_insn *, int, rtx_insn *, int); static int hypersparc_adjust_cost (rtx_insn *, int, rtx_insn *, int); +static int leon5_adjust_cost (rtx_insn *, int, rtx_insn *, int); static void sparc_emit_set_const32 (rtx, rtx); static void sparc_emit_set_const64 (rtx, rtx); @@ -1045,6 +1071,43 @@ atomic_insn_for_leon3_p (rtx_insn *insn) } } +/* True if INSN is a store instruction. */ + +static bool +store_insn_p (rtx_insn *insn) +{ + if (GET_CODE (PATTERN (insn)) != SET) + return false; + + switch (get_attr_type (insn)) + { + case TYPE_STORE: + case TYPE_FPSTORE: + return true; + default: + return false; + } +} + +/* True if INSN is a load instruction. */ + +static bool +load_insn_p (rtx_insn *insn) +{ + if (GET_CODE (PATTERN (insn)) != SET) + return false; + + switch (get_attr_type (insn)) + { + case TYPE_LOAD: + case TYPE_SLOAD: + case TYPE_FPLOAD: + return true; + default: + return false; + } +} + /* We use a machine specific pass to enable workarounds for errata. We need to have the (essentially) final form of the insn stream in order @@ -1057,10 +1120,29 @@ atomic_insn_for_leon3_p (rtx_insn *insn) && GET_CODE (PATTERN (INSN)) != USE \ && GET_CODE (PATTERN (INSN)) != CLOBBER) +rtx_insn * +next_active_non_empty_insn (rtx_insn *insn) +{ + insn = next_active_insn (insn); + + while (insn + && (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE + || GET_CODE (PATTERN (insn)) == ASM_INPUT + || (USEFUL_INSN_P (insn) + && (asm_noperands (PATTERN (insn)) >= 0) + && !strcmp (decode_asm_operands (PATTERN (insn), + NULL, NULL, NULL, + NULL, NULL), "")))) + insn = next_active_insn (insn); + + return insn; +} + static unsigned int sparc_do_work_around_errata (void) { rtx_insn *insn, *next; + bool find_first_useful = true; /* Force all instructions to be split into their final form. */ split_all_insns_noflow (); @@ -1085,6 +1167,16 @@ sparc_do_work_around_errata (void) else jump = NULL; + /* Do not begin function with atomic instruction. */ + if (sparc_fix_ut700 + && find_first_useful + && USEFUL_INSN_P (insn)) + { + find_first_useful = false; + if (atomic_insn_for_leon3_p (insn)) + emit_insn_before (gen_nop (), insn); + } + /* Place a NOP at the branch target of an integer branch if it is a floating-point operation or a floating-point branch. */ if (sparc_fix_gr712rc @@ -1105,9 +1197,7 @@ sparc_do_work_around_errata (void) instruction at branch target. */ if (sparc_fix_ut700 && NONJUMP_INSN_P (insn) - && (set = single_set (insn)) != NULL_RTX - && mem_ref (SET_SRC (set)) - && REG_P (SET_DEST (set))) + && load_insn_p (insn)) { if (jump && jump_to_label_p (jump)) { @@ -1116,7 +1206,7 @@ sparc_do_work_around_errata (void) emit_insn_before (gen_nop (), target); } - next = next_active_insn (insn); + next = next_active_non_empty_insn (insn); if (!next) break; @@ -1212,30 +1302,19 @@ sparc_do_work_around_errata (void) if (sparc_fix_b2bst && NONJUMP_INSN_P (insn) && (set = single_set (insn)) != NULL_RTX - && MEM_P (SET_DEST (set))) + && store_insn_p (insn)) { /* Sequence B begins with a double-word store. */ bool seq_b = GET_MODE_SIZE (GET_MODE (SET_DEST (set))) == 8; rtx_insn *after; int i; - next = next_active_insn (insn); + next = next_active_non_empty_insn (insn); if (!next) break; for (after = next, i = 0; i < 2; i++) { - /* Skip empty assembly statements. 
*/ - if ((GET_CODE (PATTERN (after)) == UNSPEC_VOLATILE) - || (USEFUL_INSN_P (after) - && (asm_noperands (PATTERN (after))>=0) - && !strcmp (decode_asm_operands (PATTERN (after), - NULL, NULL, NULL, - NULL, NULL), ""))) - after = next_active_insn (after); - if (!after) - break; - /* If the insn is a branch, then it cannot be problematic. */ if (!NONJUMP_INSN_P (after) || GET_CODE (PATTERN (after)) == SEQUENCE) @@ -1245,8 +1324,7 @@ sparc_do_work_around_errata (void) if (seq_b) { /* Add NOP if followed by a store. */ - if ((set = single_set (after)) != NULL_RTX - && MEM_P (SET_DEST (set))) + if (store_insn_p (after)) insert_nop = true; /* Otherwise it is ok. */ @@ -1261,15 +1339,14 @@ sparc_do_work_around_errata (void) && (MEM_P (SET_DEST (set)) || mem_ref (SET_SRC (set)))) break; - after = next_active_insn (after); + after = next_active_non_empty_insn (after); if (!after) break; } /* Add NOP if third instruction is a store. */ if (i == 1 - && (set = single_set (after)) != NULL_RTX - && MEM_P (SET_DEST (set))) + && store_insn_p (after)) insert_nop = true; } } @@ -1596,6 +1673,10 @@ dump_target_flag_bits (const int flags) fprintf (stderr, "CBCOND "); if (flags & MASK_DEPRECATED_V8_INSNS) fprintf (stderr, "DEPRECATED_V8_INSNS "); + if (flags & MASK_LEON) + fprintf (stderr, "LEON "); + if (flags & MASK_LEON3) + fprintf (stderr, "LEON3 "); if (flags & MASK_SPARCLET) fprintf (stderr, "SPARCLET "); if (flags & MASK_SPARCLITE) @@ -1632,6 +1713,7 @@ sparc_option_override (void) { TARGET_CPU_hypersparc, PROCESSOR_HYPERSPARC }, { TARGET_CPU_leon, PROCESSOR_LEON }, { TARGET_CPU_leon3, PROCESSOR_LEON3 }, + { TARGET_CPU_leon5, PROCESSOR_LEON5 }, { TARGET_CPU_leon3v7, PROCESSOR_LEON3V7 }, { TARGET_CPU_sparclite, PROCESSOR_F930 }, { TARGET_CPU_sparclite86x, PROCESSOR_SPARCLITE86X }, @@ -1663,6 +1745,7 @@ sparc_option_override (void) { "hypersparc", MASK_ISA, MASK_V8 }, { "leon", MASK_ISA|MASK_FSMULD, MASK_V8|MASK_LEON }, { "leon3", MASK_ISA, MASK_V8|MASK_LEON3 }, + { "leon5", MASK_ISA, MASK_V8|MASK_LEON3 }, { "leon3v7", MASK_ISA, MASK_LEON3 }, { "sparclite", MASK_ISA, MASK_SPARCLITE }, /* The Fujitsu MB86930 is the original sparclite chip, with no FPU. */ @@ -1973,6 +2056,9 @@ sparc_option_override (void) case PROCESSOR_LEON3V7: sparc_costs = &leon3_costs; break; + case PROCESSOR_LEON5: + sparc_costs = &leon5_costs; + break; case PROCESSOR_SPARCLET: case PROCESSOR_TSC701: sparc_costs = &sparclet_costs; @@ -10120,11 +10206,64 @@ hypersparc_adjust_cost (rtx_insn *insn, int dtype, rtx_insn *dep_insn, } static int +leon5_adjust_cost (rtx_insn *insn, int dtype, rtx_insn *dep_insn, + int cost) +{ + enum attr_type insn_type, dep_type; + rtx pat = PATTERN (insn); + rtx dep_pat = PATTERN (dep_insn); + + if (recog_memoized (insn) < 0 || recog_memoized (dep_insn) < 0) + return cost; + + insn_type = get_attr_type (insn); + dep_type = get_attr_type (dep_insn); + + switch (dtype) + { + case REG_DEP_TRUE: + /* Data dependency; DEP_INSN writes a register that INSN reads some + cycles later. */ + + switch (insn_type) + { + case TYPE_STORE: + /* Try to schedule three instructions between the store and + the ALU instruction that generated the data. */ + if (dep_type == TYPE_IALU || dep_type == TYPE_SHIFT) + { + if (GET_CODE (pat) != SET || GET_CODE (dep_pat) != SET) + break; + + if (rtx_equal_p (SET_DEST (dep_pat), SET_SRC (pat))) + return 4; + } + break; + default: + break; + } + break; + case REG_DEP_ANTI: + /* Penalize anti-dependencies for FPU instructions. 
*/ + if (fpop_insn_p (insn) || insn_type == TYPE_FPLOAD) + return 4; + break; + default: + break; + } + + return cost; +} + +static int sparc_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep, int cost, unsigned int) { switch (sparc_cpu) { + case PROCESSOR_LEON5: + cost = leon5_adjust_cost (insn, dep_type, dep, cost); + break; case PROCESSOR_SUPERSPARC: cost = supersparc_adjust_cost (insn, dep_type, dep, cost); break; diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h index 4da5a06..edafa99 100644 --- a/gcc/config/sparc/sparc.h +++ b/gcc/config/sparc/sparc.h @@ -120,21 +120,22 @@ along with GCC; see the file COPYING3. If not see #define TARGET_CPU_leon 4 #define TARGET_CPU_leon3 5 #define TARGET_CPU_leon3v7 6 -#define TARGET_CPU_sparclite 7 -#define TARGET_CPU_f930 7 /* alias */ -#define TARGET_CPU_f934 7 /* alias */ -#define TARGET_CPU_sparclite86x 8 -#define TARGET_CPU_sparclet 9 -#define TARGET_CPU_tsc701 9 /* alias */ -#define TARGET_CPU_v9 10 /* generic v9 implementation */ -#define TARGET_CPU_sparcv9 10 /* alias */ -#define TARGET_CPU_sparc64 10 /* alias */ -#define TARGET_CPU_ultrasparc 11 -#define TARGET_CPU_ultrasparc3 12 -#define TARGET_CPU_niagara 13 -#define TARGET_CPU_niagara2 14 -#define TARGET_CPU_niagara3 15 -#define TARGET_CPU_niagara4 16 +#define TARGET_CPU_leon5 7 +#define TARGET_CPU_sparclite 8 +#define TARGET_CPU_f930 8 /* alias */ +#define TARGET_CPU_f934 8 /* alias */ +#define TARGET_CPU_sparclite86x 9 +#define TARGET_CPU_sparclet 10 +#define TARGET_CPU_tsc701 10 /* alias */ +#define TARGET_CPU_v9 11 /* generic v9 implementation */ +#define TARGET_CPU_sparcv9 11 /* alias */ +#define TARGET_CPU_sparc64 11 /* alias */ +#define TARGET_CPU_ultrasparc 12 +#define TARGET_CPU_ultrasparc3 13 +#define TARGET_CPU_niagara 14 +#define TARGET_CPU_niagara2 15 +#define TARGET_CPU_niagara3 16 +#define TARGET_CPU_niagara4 17 #define TARGET_CPU_niagara7 19 #define TARGET_CPU_m8 20 @@ -229,7 +230,8 @@ along with GCC; see the file COPYING3. If not see #endif #if TARGET_CPU_DEFAULT == TARGET_CPU_leon \ - || TARGET_CPU_DEFAULT == TARGET_CPU_leon3 + || TARGET_CPU_DEFAULT == TARGET_CPU_leon3 \ + || TARGET_CPU_DEFAULT == TARGET_CPU_leon5 #define CPP_CPU32_DEFAULT_SPEC "-D__leon__ -D__sparc_v8__" #define ASM_CPU32_DEFAULT_SPEC AS_LEON_FLAG #endif @@ -285,6 +287,7 @@ along with GCC; see the file COPYING3. If not see %{mcpu=hypersparc:-D__hypersparc__ -D__sparc_v8__} \ %{mcpu=leon:-D__leon__ -D__sparc_v8__} \ %{mcpu=leon3:-D__leon__ -D__sparc_v8__} \ +%{mcpu=leon5:-D__leon__ -D__sparc_v8__} \ %{mcpu=leon3v7:-D__leon__} \ %{mcpu=v9:-D__sparc_v9__} \ %{mcpu=ultrasparc:-D__sparc_v9__} \ @@ -337,6 +340,7 @@ along with GCC; see the file COPYING3. 
If not see %{mcpu=hypersparc:-Av8} \ %{mcpu=leon:" AS_LEON_FLAG "} \ %{mcpu=leon3:" AS_LEON_FLAG "} \ +%{mcpu=leon5:" AS_LEON_FLAG "} \ %{mcpu=leon3v7:" AS_LEONV7_FLAG "} \ %{mv8plus:-Av8plus} \ %{mcpu=v9:-Av9} \ diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index 24b76e0..294c918 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -233,6 +233,7 @@ hypersparc, leon, leon3, + leon5, leon3v7, sparclite, f930, @@ -638,6 +639,7 @@ (include "supersparc.md") (include "hypersparc.md") (include "leon.md") +(include "leon5.md") (include "sparclet.md") (include "ultra1_2.md") (include "ultra3.md") @@ -8353,9 +8355,15 @@ visl") (unspec:SI [(match_operand:SI 1 "memory_operand" "m")] UNSPEC_SP_SET)) (set (match_scratch:SI 2 "=&r") (const_int 0))] "TARGET_ARCH32" - "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2" +{ + if (sparc_fix_b2bst) + return "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2\;nop"; + else + return "ld\t%1, %2\;st\t%2, %0\;mov\t0, %2"; +} [(set_attr "type" "multi") - (set_attr "length" "3")]) + (set (attr "length") (if_then_else (eq_attr "fix_b2bst" "true") + (const_int 4) (const_int 3)))]) (define_insn "stack_protect_set64" [(set (match_operand:DI 0 "memory_operand" "=m") diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt index fb79267..658a187 100644 --- a/gcc/config/sparc/sparc.opt +++ b/gcc/config/sparc/sparc.opt @@ -176,6 +176,9 @@ EnumValue Enum(sparc_processor) String(leon3v7) Value(PROCESSOR_LEON3V7) EnumValue +Enum(sparc_processor) String(leon5) Value(PROCESSOR_LEON5) + +EnumValue Enum(sparc_processor) String(sparclite) Value(PROCESSOR_SPARCLITE) EnumValue diff --git a/gcc/config/xtensa/t-xtensa b/gcc/config/xtensa/t-xtensa index 973815c..d06e492 100644 --- a/gcc/config/xtensa/t-xtensa +++ b/gcc/config/xtensa/t-xtensa @@ -16,4 +16,5 @@ # along with GCC; see the file COPYING3. If not see # <http://www.gnu.org/licenses/>. +TM_H += $(srcdir)/../include/xtensa-config.h $(out_object_file): gt-xtensa.h |
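A minimal sketch, not part of this patch, of the situation the new next_active_non_empty_insn helper addresses in sparc_do_work_around_errata: empty inline asm statements are now skipped rather than counted as the neighbouring instruction when the errata workarounds scan around loads and stores. The function name below is hypothetical, and the option spellings (-mfix-ut700, -mfix-gr712rc) are assumed from the sparc_fix_ut700 and sparc_fix_gr712rc flags used above:

/* Illustration only.  With the UT700/GR712RC errata workarounds enabled,
   the workaround pass scans the instructions around each load/store; the
   new next_active_non_empty_insn helper skips empty asm statements such
   as the one below instead of treating them as a real instruction.  */
void
copy_pair (int *dst, const int *src)
{
  dst[0] = src[0];
  __asm__ ("");   /* empty asm: skipped by the errata scan */
  dst[1] = src[1];
}

In the same vein, the stack_protect_set32 pattern above now appends a trailing nop (and reports length 4 instead of 3) when sparc_fix_b2bst is set. Separately, the sparc.opt, sparc.h and sparc.md hunks make -mcpu=leon5 selectable, wiring it to the new leon5.md scheduling description, leon5_costs and leon5_adjust_cost.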