author     Haochen Jiang <haochen.jiang@intel.com>   2023-10-09 16:09:28 +0800
committer  Haochen Jiang <haochen.jiang@intel.com>   2023-10-09 17:02:27 +0800
commit     79fb4764434fe7e77d811c8ed9381c95d6c42594 (patch)
tree       f58c9fa0ab4f3fa102bcc30bdeea9f09cf708f9c /gcc
parent     6882df700cebf5a2292566c5acb2480f7dafd116 (diff)
[PATCH 1/5] Push evex512 target for 512 bit intrins
gcc/ChangeLog:
* config/i386/avx512fintrin.h: Add evex512 target for 512 bit intrins.
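For context, a minimal sketch of what the split means for users (illustration only, not part of the patch; assumes the -mevex512 / target("evex512") support this series introduces, and the function names are hypothetical):

#include <immintrin.h>

/* 512-bit intrinsics sit behind the evex512 feature.  */
__attribute__ ((target ("avx512f,evex512")))
__m512d
add_zmm (__m512d a, __m512d b)
{
  return _mm512_add_pd (a, b);	/* needs evex512 */
}

/* The scalar and mask intrinsics moved in this patch only need avx512f.  */
__attribute__ ((target ("avx512f")))
__m128d
add_xmm (__m128d a, __m128d b)
{
  return _mm_add_round_sd (a, b, _MM_FROUND_CUR_DIRECTION);
}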
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/i386/avx512fintrin.h  7411
1 file changed, 3745 insertions, 3666 deletions
diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h
index 517e787..85bf72d 100644
--- a/gcc/config/i386/avx512fintrin.h
+++ b/gcc/config/i386/avx512fintrin.h
@@ -34,6 +34,3748 @@
 #define __DISABLE_AVX512F__
 #endif /* __AVX512F__ */
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+typedef unsigned int __mmask32;
+
+/* Constants for mantissa extraction */
+typedef enum
+{
+ _MM_MANT_NORM_1_2, /* interval [1, 2) */
+ _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */
+ _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */
+ _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+ _MM_MANT_SIGN_src, /* sign = sign(SRC) */
+ _MM_MANT_SIGN_zero, /* sign = 0 */
+ _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
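[A minimal usage sketch of the embedded-rounding scalar add shown above (illustration only, not part of the patch; the function name is hypothetical):

#include <immintrin.h>

__m128d
add_up (__m128d a, __m128d b, __mmask8 k)
{
  /* dst[0] = k[0] ? a[0] + b[0] (rounded toward +inf) : a[0];
     dst[1] is copied from a.  */
  return _mm_mask_add_round_sd (a, k, a, b,
				_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}
]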
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_add_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_addsd_round(A, B, C)
+
+#define _mm_mask_add_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_addsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_add_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_add_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_addss_round(A, B, C)
+
+#define _mm_mask_add_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_addss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_add_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_sub_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_subsd_round(A, B, C)
+
+#define _mm_mask_sub_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_subsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_sub_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_sub_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_subss_round(A, B, C)
+
+#define _mm_mask_sub_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_subss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_sub_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#endif
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __B,
+ (__v2df) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __B,
+ (__v4sf) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __B,
+ (__v2df) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __B,
+ (__v4sf) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
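[The rcp14/rsqrt14 intrinsics above return approximations with relative error bounded by 2^-14; a sketch (illustration only, not part of the patch):

#include <immintrin.h>

float
approx_rsqrt (float x)
{
  __m128 v = _mm_set_ss (x);
  /* ~1/sqrt(x), |relative error| <= 2**-14; upper elements from v.  */
  return _mm_cvtss_f32 (_mm_rsqrt14_ss (v, v));
}
]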
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_sd (__mmask8 __U, __m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+#else
+#define _mm_sqrt_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \
+ (__v2df) _mm_setzero_pd (), -1, C)
+
+#define _mm_mask_sqrt_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, W, U, C)
+
+#define _mm_maskz_sqrt_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \
+ (__v2df) _mm_setzero_pd (), U, C)
+
+#define _mm_sqrt_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \
+ (__v4sf) _mm_setzero_ps (), -1, C)
+
+#define _mm_mask_sqrt_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_sqrtss_mask_round (B, A, W, U, C)
+
+#define _mm_maskz_sqrt_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \
+ (__v4sf) _mm_setzero_ps (), U, C)
+
+#define _mm_mul_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_mulsd_round(A, B, C)
+
+#define _mm_mask_mul_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_mulsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_mul_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_mul_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_mulss_round(A, B, C)
+
+#define _mm_mask_mul_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_mulss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_mul_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_div_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_divsd_round(A, B, C)
+
+#define _mm_mask_div_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_divsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_div_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_div_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_divss_round(A, B, C)
+
+#define _mm_mask_div_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_divss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_div_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_scalef_round_sd(A, B, C) \
+ ((__m128d) \
+ __builtin_ia32_scalefsd_mask_round ((A), (B), \
+ (__v2df) _mm_undefined_pd (), \
+ -1, (C)))
+
+#define _mm_scalef_round_ss(A, B, C) \
+ ((__m128) \
+ __builtin_ia32_scalefss_mask_round ((A), (B), \
+ (__v4sf) _mm_undefined_ps (), \
+ -1, (C)))
+
+#define _mm_mask_scalef_round_sd(W, U, A, B, C) \
+ ((__m128d) \
+ __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C)))
+
+#define _mm_mask_scalef_round_ss(W, U, A, B, C) \
+ ((__m128) \
+ __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C)))
+
+#define _mm_maskz_scalef_round_sd(U, A, B, C) \
+ ((__m128d) \
+ __builtin_ia32_scalefsd_mask_round ((A), (B), \
+ (__v2df) _mm_setzero_pd (), \
+ (U), (C)))
+
+#define _mm_maskz_scalef_round_ss(U, A, B, C) \
+ ((__m128) \
+ __builtin_ia32_scalefss_mask_round ((A), (B), \
+ (__v4sf) _mm_setzero_ps (), \
+ (U), (C)))
+#endif
+
+#define _mm_mask_sqrt_sd(W, U, A, B) \
+ _mm_mask_sqrt_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_sqrt_sd(U, A, B) \
+ _mm_maskz_sqrt_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_sqrt_ss(W, U, A, B) \
+ _mm_mask_sqrt_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_sqrt_ss(U, A, B) \
+ _mm_maskz_sqrt_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_scalef_sd(W, U, A, B) \
+ _mm_mask_scalef_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_scalef_sd(U, A, B) \
+ _mm_maskz_scalef_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_scalef_ss(W, U, A, B) \
+ _mm_mask_scalef_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_scalef_ss(U, A, B) \
+ _mm_maskz_scalef_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+ return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu64_sd(A, B, C) \
+ (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C)
+
+#define _mm_cvt_roundi64_sd(A, B, C) \
+ (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
+
+#define _mm_cvt_roundsi64_sd(A, B, C) \
+ (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
+#endif
+
+#endif
+
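[scalef multiplies by a power of two, x * 2**floor(y), much like ldexp; a sketch (illustration only, not part of the patch):

#include <immintrin.h>

double
scale_by (double x, double n)
{
  /* e.g. scale_by (3.0, 4.0) == 48.0, using the current rounding mode.  */
  return _mm_cvtsd_f64 (_mm_scalef_round_sd (_mm_set_sd (x), _mm_set_sd (n),
					     _MM_FROUND_CUR_DIRECTION));
}
]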
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu32_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtusi2ss32(A, B, C)
+
+#define _mm_cvt_roundi32_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
+
+#define _mm_cvt_roundsi32_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu64_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtusi2ss64(A, B, C)
+
+#define _mm_cvt_roundi64_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
+
+#define _mm_cvt_roundsi64_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
+#endif
+
+#endif
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
+{
+ return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_ss (__mmask8 __U, const float *__P)
+{
+ return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (),
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P)
+{
+ return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sd (__mmask8 __U, const double *__P)
+{
+ return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (),
+ __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+ (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (), __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+ (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C,
+ const int __imm, const int __R)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C, __imm,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B,
+ __m128i __C, const int __imm, const int __R)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C, __imm,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128i __C, const int __imm, const int __R)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C,
+ __imm,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C,
+ const int __imm, const int __R)
+{
+ return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B,
+ __m128i __C, const int __imm, const int __R)
+{
+ return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ __m128i __C, const int __imm, const int __R)
+{
+ return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fixupimm_round_sd(X, Y, Z, C, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(-1), (R)))
+
+#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#define _mm_fixupimm_round_ss(X, Y, Z, C, R) \
+ ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(-1), (R)))
+
+#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R) \
+ ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R) \
+ ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_u64 (__m128 __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_si64 (__m128 __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_i64 (__m128 __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_u64 (__m128 __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_i64 (__m128 __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_si64 (__m128 __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
+}
+#else
+#define _mm_cvt_roundss_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B))
+
+#define _mm_cvt_roundss_si64(A, B) \
+ ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvt_roundss_i64(A, B) \
+ ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvtt_roundss_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B))
+
+#define _mm_cvtt_roundss_i64(A, B) \
+ ((long long)__builtin_ia32_vcvttss2si64(A, B))
+
+#define _mm_cvtt_roundss_si64(A, B) \
+ ((long long)__builtin_ia32_vcvttss2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_u32 (__m128 __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_si32 (__m128 __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_i32 (__m128 __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_u32 (__m128 __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R);
+}
+
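[These conversions take the rounding mode as an immediate instead of reading MXCSR; a sketch (illustration only, not part of the patch):

#include <immintrin.h>

unsigned
to_u32_nearest (float x)
{
  /* Round-to-nearest-even, with exceptions suppressed.  */
  return _mm_cvt_roundss_u32 (_mm_set_ss (x),
			      _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
]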
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_i32 (__m128 __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_si32 (__m128 __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
+}
+#else
+#define _mm_cvt_roundss_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvtss2usi32(A, B))
+
+#define _mm_cvt_roundss_si32(A, B) \
+ ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvt_roundss_i32(A, B) \
+ ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvtt_roundss_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvttss2usi32(A, B))
+
+#define _mm_cvtt_roundss_si32(A, B) \
+ ((int)__builtin_ia32_vcvttss2si32(A, B))
+
+#define _mm_cvtt_roundss_i32(A, B) \
+ ((int)__builtin_ia32_vcvttss2si32(A, B))
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_u64 (__m128d __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_si64 (__m128d __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_i64 (__m128d __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_u64 (__m128d __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_si64 (__m128d __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_i64 (__m128d __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B))
+
+#define _mm_cvt_roundsd_si64(A, B) \
+ ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvt_roundsd_i64(A, B) \
+ ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B))
+
+#define _mm_cvtt_roundsd_si64(A, B) \
+ ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_i64(A, B) \
+ ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_u32 (__m128d __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_si32 (__m128d __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_i32 (__m128d __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_u32 (__m128d __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_i32 (__m128d __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_si32 (__m128d __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsd_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128d __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A,
+ (__v2df) __B,
+ (__v4sf) __W,
+ __U,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsd_ss (__mmask8 __U, __m128 __A,
+ __m128d __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A,
+ (__v2df) __B,
+ _mm_setzero_ps (),
+ __U,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundss_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128 __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A,
+ (__v4sf) __B,
+ (__v2df) __W,
+ __U,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A,
+ __m128 __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A,
+ (__v4sf) __B,
+ _mm_setzero_pd (),
+ __U,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
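[getexp extracts floor(log2(|x|)) as a floating-point value; a sketch (illustration only, not part of the patch):

#include <immintrin.h>

float
exponent_of (float x)
{
  __m128 v = _mm_set_ss (x);
  /* e.g. 8.0f for x == 300.0f, since 2**8 <= 300 < 2**9.  */
  return _mm_cvtss_f32 (_mm_getexp_round_ss (v, v, _MM_FROUND_NO_EXC));
}
]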
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_sd (__m128d __A, __m128d __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ (__v2df) __W,
+ __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ (__v2df)
+ _mm_setzero_pd(),
+ __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_ss (__m128 __A, __m128 __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ (__v4sf) __W,
+ __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ (__v4sf)
+ _mm_setzero_ps(),
+ __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm,
+ const int __R)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_ss (__m128 __A, __mmask8 __B, __m128 __C,
+ __m128 __D, const int __imm, const int __R)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __C,
+ (__v4sf) __D, __imm,
+ (__v4sf) __A,
+ (__mmask8) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_ss (__mmask8 __A, __m128 __B, __m128 __C,
+ const int __imm, const int __R)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __B,
+ (__v4sf) __C, __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __A,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm,
+ const int __R)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_sd (__m128d __A, __mmask8 __B, __m128d __C,
+ __m128d __D, const int __imm, const int __R)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __C,
+ (__v2df) __D, __imm,
+ (__v2df) __A,
+ (__mmask8) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_sd (__mmask8 __A, __m128d __B, __m128d __C,
+ const int __imm, const int __R)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __B,
+ (__v2df) __C, __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __A,
+ __R);
+}
+
+#else
+#define _mm_cvt_roundsd_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B))
+
+#define _mm_cvt_roundsd_si32(A, B) \
+ ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvt_roundsd_i32(A, B) \
+ ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B))
+
+#define _mm_cvtt_roundsd_si32(A, B) \
+ ((int)__builtin_ia32_vcvttsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_i32(A, B) \
+ ((int)__builtin_ia32_vcvttsd2si32(A, B))
+
+#define _mm_cvt_roundsd_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C)
+
+#define _mm_mask_cvt_roundsd_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), (W), (U), (C))
+
+#define _mm_maskz_cvt_roundsd_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), _mm_setzero_ps (), \
+ (U), (C))
+
+#define _mm_cvt_roundss_sd(A, B, C) \
+ (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C)
+
+#define _mm_mask_cvt_roundss_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), (W), (U), (C))
+
+#define _mm_maskz_cvt_roundss_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), _mm_setzero_pd (), \
+ (U), (C))
+
+#define _mm_getmant_round_sd(X, Y, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (R)))
+
+#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)(__m128d)_mm_setzero_pd(), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm_getmant_round_ss(X, Y, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (R)))
+
+#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm_maskz_getmant_round_ss(U, X, Y, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)(__m128)_mm_setzero_ps(), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm_getexp_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R))
+
+#define _mm_mask_getexp_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_getexp_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_getexp_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R))
+
+#define _mm_mask_getexp_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_getexp_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_roundscale_round_ss(A, B, I, R) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \
+ (__v4sf) (__m128) (B), \
+ (int) (I), \
+ (__v4sf) _mm_setzero_ps (), \
+ (__mmask8) (-1), \
+ (int) (R)))
+#define _mm_mask_roundscale_round_ss(A, U, B, C, I, R) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), \
+ (__v4sf) (__m128) (C), \
+ (int) (I), \
+ (__v4sf) (__m128) (A), \
+ (__mmask8) (U), \
+ (int) (R)))
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \
+ (__v4sf) (__m128) (B), \
+ (int) (I), \
+ (__v4sf) _mm_setzero_ps (), \
+ (__mmask8) (U), \
+ (int) (R)))
+#define _mm_roundscale_round_sd(A, B, I, R) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \
+ (__v2df) (__m128d) (B), \
+ (int) (I), \
+ (__v2df) _mm_setzero_pd (), \
+ (__mmask8) (-1), \
+ (int) (R)))
+#define _mm_mask_roundscale_round_sd(A, U, B, C, I, R) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), \
+ (__v2df) (__m128d) (C), \
+ (int) (I), \
+ (__v2df) (__m128d) (A), \
+ (__mmask8) (U), \
+ (int) (R)))
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \
+ (__v2df) (__m128d) (B), \
+ (int) (I), \
+ (__v2df) _mm_setzero_pd (), \
+ (__mmask8) (U), \
+ (int) (R)))
+
+#endif
+
+#define _mm_mask_cvtss_sd(W, U, A, B) \
+ _mm_mask_cvt_roundss_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_cvtss_sd(U, A, B) \
+ _mm_maskz_cvt_roundss_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_cvtsd_ss(W, U, A, B) \
+ _mm_mask_cvt_roundsd_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_cvtsd_ss(U, A, B) \
+ _mm_maskz_cvt_roundsd_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
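[getmant normalizes the mantissa into the interval selected by the first enum, with sign handling from the second; a sketch (illustration only, not part of the patch):

#include <immintrin.h>

double
mantissa_of (double x)
{
  __m128d v = _mm_set_sd (x);
  /* 12.0 == 1.5 * 2**3, so this yields 1.5.  */
  return _mm_cvtsd_f64 (_mm_getmant_round_sd (v, v,
					      _MM_MANT_NORM_1_2,
					      _MM_MANT_SIGN_src,
					      _MM_FROUND_CUR_DIRECTION));
}
]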
+#ifdef __OPTIMIZE__
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask16 (__mmask16 __A, unsigned int __B)
+{
+ return (__mmask16) __builtin_ia32_kshiftlihi ((__mmask16) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask16 (__mmask16 __A, unsigned int __B)
+{
+ return (__mmask16) __builtin_ia32_kshiftrihi ((__mmask16) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+ (__v2df) __Y, __P,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y,
+ const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+ (__v2df) __Y, __P,
+ (__mmask8) __M, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+ (__v4sf) __Y, __P,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y,
+ const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+ (__v4sf) __Y, __P,
+ (__mmask8) __M, __R);
+}
+
+#else
+#define _kshiftli_mask16(X, Y) \
+ ((__mmask16) __builtin_ia32_kshiftlihi ((__mmask16)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask16(X, Y) \
+ ((__mmask16) __builtin_ia32_kshiftrihi ((__mmask16)(X), (__mmask8)(Y)))
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P),\
+ (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P),\
+ (M), R))
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (M), R))
+
+#endif
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B);
+ return (unsigned char) __builtin_ia32_kortestzhi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A,
+ (__mmask16) __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A,
+ (__mmask16) __B);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask16_u32 (__mmask16 __A)
+{
+ return (unsigned int) __builtin_ia32_kmovw ((__mmask16 ) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu32_mask16 (unsigned int __A)
+{
+ return (__mmask16) __builtin_ia32_kmovw ((__mmask16 ) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask16 (__mmask16 *__A)
+{
+ return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask16 (__mmask16 *__A, __mmask16 __B)
+{
+ *(__mmask16 *) __A = __builtin_ia32_kmovw (__B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kand_mask16 (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kandn_mask16 (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A,
+ (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kor_mask16 (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxnor_mask16 (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxor_mask16 (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_knot_mask16 (__mmask16 __A)
+{
+ return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kunpackb_mask16 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
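[The _k*_mask16 operations above map to the k-register instructions (KANDW, KORW, KNOTW, ...); a sketch (illustration only, not part of the patch):

#include <immintrin.h>

__mmask16
keep_common_or_neither (__mmask16 a, __mmask16 b)
{
  /* (a & b) | ~(a | b), computed entirely in mask registers.  */
  return _kor_mask16 (_kand_mask16 (a, b),
		      _knot_mask16 (_kor_mask16 (a, b)));
}
]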
_mm_setzero_ps (), + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_minss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +#else +#define _mm_max_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_maxsd_round(A, B, C) + +#define _mm_mask_max_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_maxsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_max_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_max_round_ss(A, B, C) \ + (__m128)__builtin_ia32_maxss_round(A, B, C) + +#define _mm_mask_max_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_maxss_mask_round(A, B, W, U, C) + +#define _mm_maskz_max_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#define _mm_min_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_minsd_round(A, B, C) + +#define _mm_mask_min_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_minsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_min_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_min_round_ss(A, B, C) \ + (__m128)__builtin_ia32_minss_round(A, B, C) + +#define _mm_mask_min_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_minss_mask_round(A, B, W, U, C) + +#define _mm_maskz_min_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_ss (__m128 __W, __m128 __A, 
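/* Illustrative note and sketch (editorial, not part of this patch): the
   scalar max/min instructions accept SAE but no embedded rounding mode,
   so the rounding argument above is normally _MM_FROUND_CUR_DIRECTION or
   _MM_FROUND_NO_EXC:

     #include <immintrin.h>

     __m128d quiet_max_low (__m128d a, __m128d b)
     {
       return _mm_max_round_sd (a, b, _MM_FROUND_NO_EXC);
     }
*/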
__m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + __R); +} +#else +#define _mm_fmadd_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R) + +#define _mm_fmadd_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R) + +#define _mm_fmsub_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R) + +#define _mm_fmsub_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R) + +#define _mm_fnmadd_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R) + +#define _mm_fnmadd_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R) + +#define _mm_fnmsub_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R) + +#define _mm_fnmsub_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R) +#endif + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + 
(__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern 
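/* Illustrative sketch (editorial example, not part of this patch): for the
   masked scalar FMAs above, the value that survives a clear mask bit
   differs per flavor -- _mm_mask_* keeps the first operand, _mm_mask3_*
   keeps the last, and _mm_maskz_* zeroes the low element:

     #include <immintrin.h>

     __m128d fma_or_keep_w (__m128d w, __mmask8 m, __m128d a, __m128d b)
     {
       return _mm_mask_fmadd_sd (w, m, a, b);  // low: m ? w*a+b : w,
                                               // upper lane copied from w
     }
*/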
__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) +{ + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_mask_fnmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) +{ + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_round_ss 
(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, __R); +} +#else +#define _mm_mask_fmadd_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, C, U, R) + +#define _mm_mask_fmadd_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask (A, B, C, U, R) + +#define _mm_mask3_fmadd_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, B, C, U, R) + +#define _mm_mask3_fmadd_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask3 (A, B, C, U, R) + +#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, C, U, R) + +#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, C, U, R) + +#define _mm_mask_fmsub_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, -(C), U, R) + +#define _mm_mask_fmsub_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask (A, B, -(C), U, R) + +#define _mm_mask3_fmsub_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, B, C, U, R) + +#define _mm_mask3_fmsub_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmsubss3_mask3 (A, B, C, U, R) + +#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, -(C), U, R) + +#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, -(C), U, R) + +#define _mm_mask_fnmadd_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), C, U, R) + +#define _mm_mask_fnmadd_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), C, U, R) + +#define _mm_mask3_fnmadd_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, -(B), C, U, R) + +#define _mm_mask3_fnmadd_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask3 (A, -(B), C, U, R) + +#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), C, U, R) + +#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), C, U, R) + +#define _mm_mask_fnmsub_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), -(C), U, R) + +#define _mm_mask_fnmsub_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), -(C), U, R) + +#define _mm_mask3_fnmsub_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, -(B), C, U, R) + +#define _mm_mask3_fnmsub_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmsubss3_mask3 (A, -(B), C, U, R) + +#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), -(C), U, R) + +#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), -(C), U, R) +#endif + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R) +{ + return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R) +{ + return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R); +} +#else +#define _mm_comi_round_ss(A, B, C, D)\ +__builtin_ia32_vcomiss(A, B, C, D) +#define 
_mm_comi_round_sd(A, B, C, D)\ +__builtin_ia32_vcomisd(A, B, C, D) +#endif + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) +{ + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + 
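/* Illustrative sketch (editorial example, not part of this patch): the
   unsuffixed masked scalar ops above are the _round_ variants pinned to
   _MM_FROUND_CUR_DIRECTION; e.g. a merge-masked low-lane add:

     #include <immintrin.h>

     __m128d add_low_if (__m128d w, __mmask8 m, __m128d a, __m128d b)
     {
       return _mm_mask_add_sd (w, m, a, b);  // low: m ? a+b : w,
                                             // upper lane copied from a
     }
*/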
_MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) +{ + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline 
__m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_ss (__m128 __A, unsigned long long __B) +{ + return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_sd (__m128d __A, unsigned long long __B) +{ + return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_ss (__m128 __A, unsigned __B) +{ + return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm) +{ + return 
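/* Illustrative sketch (editorial example, not part of this patch): VSCALEF
   computes A * 2^floor(B), so _mm_scalef_sd above gives a branch-free
   ldexp-style scaling of the low lane:

     #include <immintrin.h>

     double scale_by_pow2 (double x, double n)
     {
       __m128d r = _mm_scalef_sd (_mm_set_sd (x), _mm_set_sd (n));
       return _mm_cvtsd_f64 (r);
     }
*/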
(__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm_fixupimm_sd(X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_sd(X, U, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_ss(X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_ss(X, U, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#endif + +#ifdef __x86_64__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif /* __x86_64__ */ + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_i32 (__m128 __A) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_cvtsd2si ((__v2df) __A); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i32 (__m128 __A) +{ + return (int) 
__builtin_ia32_cvtss2si ((__v4sf) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_sd (__m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_cvtsi2sd ((__v2df) __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_ss (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); +} + +#ifdef __x86_64__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_cvtsd2si64 ((__v2df) __A); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_cvtss2si64 ((__v4sf) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_sd (__m128d __A, long long __B) +{ + return (__m128d) __builtin_ia32_cvtsi642sd ((__v2df) __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_ss (__m128 __A, long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); +} +#endif /* __x86_64__ */ + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps 
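/* Illustrative sketch (editorial example, not part of this patch): the
   "cvtt" conversions above truncate toward zero, while the plain "cvt"
   forms honor the current MXCSR rounding mode (round-to-nearest-even by
   default):

     #include <immintrin.h>

     int trunc_low (__m128d a) { return _mm_cvttsd_i32 (a); }  // 1.7 -> 1
     int round_low (__m128d a) { return _mm_cvtsd_i32 (a); }   // 1.7 -> 2
*/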
(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) __W, + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_sd (__mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) + _mm_setzero_pd(), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) __W, + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) + _mm_setzero_ps(), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + + +extern __inline 
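/* Illustrative sketch (editorial example, not part of this patch): the
   mantissa interval and sign enums are packed into a single immediate as
   (sign << 2) | norm, matching the (__D << 2) | __C expressions above:

     #include <immintrin.h>

     __m128d mant_1_2 (__m128d a)
     {
       // Mantissa normalized to [1, 2), sign taken from the source.
       return _mm_getmant_sd (a, a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
     }
*/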
__m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ss (__m128 __A, __mmask8 __B, __m128 __C, __m128 __D, + const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __C, + (__v4sf) __D, __imm, + (__v4sf) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_ss (__mmask8 __A, __m128 __B, __m128 __C, + const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __B, + (__v4sf) __C, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_sd (__m128d __A, __mmask8 __B, __m128d __C, __m128d __D, + const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __C, + (__v2df) __D, __imm, + (__v2df) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_sd (__mmask8 __A, __m128d __B, __m128d __C, + const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __B, + (__v2df) __C, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm_getmant_sd(X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sd(W, U, X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_sd(U, X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + 
(int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getmant_ss(X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_ss(W, U, X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_ss(U, X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getexp_ss(A, B) \ + ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getexp_ss(W, U, A, B) \ + (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U,\ + _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_getexp_ss(U, A, B) \ + (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U,\ + _MM_FROUND_CUR_DIRECTION) + +#define _mm_getexp_sd(A, B) \ + ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getexp_sd(W, U, A, B) \ + (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U,\ + _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_getexp_sd(U, A, B) \ + (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U,\ + _MM_FROUND_CUR_DIRECTION) + +#define _mm_roundscale_ss(A, B, I) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \ + (__v4sf) (__m128) (B), \ + (int) (I), \ + (__v4sf) _mm_setzero_ps (), \ + (__mmask8) (-1), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_roundscale_ss(A, U, B, C, I) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), \ + (__v4sf) (__m128) (C), \ + (int) (I), \ + (__v4sf) (__m128) (A), \ + (__mmask8) (U), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_roundscale_ss(U, A, B, I) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \ + (__v4sf) (__m128) (B), \ + (int) (I), \ + (__v4sf) _mm_setzero_ps (), \ + (__mmask8) (U), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_sd(A, B, I) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \ + (__v2df) (__m128d) (B), \ + (int) (I), \ + (__v2df) _mm_setzero_pd (), \ + (__mmask8) (-1), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_roundscale_sd(A, U, B, C, I) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), \ + (__v2df) (__m128d) (C), \ + (int) (I), \ + (__v2df) (__m128d) (A), \ + (__mmask8) (U), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_roundscale_sd(U, A, B, I) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \ + (__v2df) (__m128d) (B), \ + (int) (I), \ + (__v2df) _mm_setzero_pd (), \ + (__mmask8) (U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_sd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + M,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_ss_mask(X, Y, P) \ + ((__mmask8) 
__builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + M,_MM_FROUND_CUR_DIRECTION)) + +#endif + +#ifdef __DISABLE_AVX512F__ +#undef __DISABLE_AVX512F__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512F__ */ + +#if !defined (__AVX512F__) || !defined (__EVEX512__) +#pragma GCC push_options +#pragma GCC target("avx512f,evex512") +#define __DISABLE_AVX512F_512__ +#endif /* __AVX512F_512__ */ + /* Internal data types for implementing the intrinsics. */ typedef double __v8df __attribute__ ((__vector_size__ (64))); typedef float __v16sf __attribute__ ((__vector_size__ (64))); @@ -57,9 +3799,6 @@ typedef float __m512_u __attribute__ ((__vector_size__ (64), __may_alias__, __al typedef long long __m512i_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1))); typedef double __m512d_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1))); -typedef unsigned char __mmask8; -typedef unsigned short __mmask16; - extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_int2mask (int __M) @@ -1498,174 +5237,6 @@ _mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B) (__mmask16) __U); } -#ifdef __OPTIMIZE__ -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, - (__v2df) __B, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_add_round_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_add_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_addss_round ((__v4sf) __A, - (__v4sf) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_add_round_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_add_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - const int __R) -{ - return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, - (__v2df) __B, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_round_sd (__m128d 
__W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_subss_round ((__v4sf) __A, - (__v4sf) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - const int __R) -{ - return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} - -#else -#define _mm_add_round_sd(A, B, C) \ - (__m128d)__builtin_ia32_addsd_round(A, B, C) - -#define _mm_mask_add_round_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_addsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_add_round_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_add_round_ss(A, B, C) \ - (__m128)__builtin_ia32_addss_round(A, B, C) - -#define _mm_mask_add_round_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_addss_mask_round(A, B, W, U, C) - -#define _mm_maskz_add_round_ss(U, A, B, C) \ - (__m128)__builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#define _mm_sub_round_sd(A, B, C) \ - (__m128d)__builtin_ia32_subsd_round(A, B, C) - -#define _mm_mask_sub_round_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_subsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_sub_round_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_sub_round_ss(A, B, C) \ - (__m128)__builtin_ia32_subss_round(A, B, C) - -#define _mm_mask_sub_round_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_subss_mask_round(A, B, W, U, C) - -#define _mm_maskz_sub_round_ss(U, A, B, C) \ - (__m128)__builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#endif - /* Constant helper to represent the ternary logic operations among vector A, B and C. 
*/ typedef enum @@ -1856,62 +5427,6 @@ _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) (__mmask16) __U); } -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rcp14_sd (__m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __B, - (__v2df) __A); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B, - (__v2df) __A, - (__v2df) __W, - (__mmask8) __U); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B, - (__v2df) __A, - (__v2df) _mm_setzero_ps (), - (__mmask8) __U); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rcp14_ss (__m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __B, - (__v4sf) __A); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B, - (__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B, - (__v4sf) __A, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U); -} - extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_rsqrt14_pd (__m512d __A) @@ -1970,62 +5485,6 @@ _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) (__mmask16) __U); } -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rsqrt14_sd (__m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __B, - (__v2df) __A); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B, - (__v2df) __A, - (__v2df) __W, - (__mmask8) __U); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B, - (__v2df) __A, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_rsqrt14_ss (__m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __B, - (__v4sf) __A); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B, - (__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B, - (__v4sf) __A, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U); -} - #ifdef __OPTIMIZE__ extern __inline __m512d 
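/* Editorial sketch (names are mine): rcp14/rsqrt14 return approximations
   with relative error bounded by 2^-14.  One Newton-Raphson step
   x' = x * (2 - d * x) refines the reciprocal estimate to near full
   precision.  */
static __m512d
recip_refined (__m512d d)
{
  __m512d x = _mm512_rcp14_pd (d);          /* ~14-bit reciprocal estimate */
  __m512d two = _mm512_set1_pd (2.0);
  return _mm512_mul_pd (x, _mm512_fnmadd_pd (d, x, two));  /* x * (2 - d*x) */
}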
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -2086,71 +5545,6 @@ _mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R) (__mmask16) __U, __R); } -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, - (__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sqrt_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, - (__v2df) __A, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sqrt_round_sd (__mmask8 __U, __m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, - (__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, - (__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_sqrt_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) -{ - return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, - (__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, - (__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} #else #define _mm512_sqrt_round_pd(A, C) \ (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C) @@ -2170,41 +5564,8 @@ _mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) #define _mm512_maskz_sqrt_round_ps(U, A, C) \ (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm_sqrt_round_sd(A, B, C) \ - (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \ - (__v2df) _mm_setzero_pd (), -1, C) - -#define _mm_mask_sqrt_round_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, W, U, C) - -#define _mm_maskz_sqrt_round_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \ - (__v2df) _mm_setzero_pd (), U, C) - -#define _mm_sqrt_round_ss(A, B, C) \ - (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \ - (__v4sf) _mm_setzero_ps (), -1, C) - -#define _mm_mask_sqrt_round_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_sqrtss_mask_round (B, A, W, U, C) - -#define _mm_maskz_sqrt_round_ss(U, A, B, C) \ - (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \ - (__v4sf) _mm_setzero_ps (), U, C) #endif -#define _mm_mask_sqrt_sd(W, U, A, B) \ - _mm_mask_sqrt_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_sqrt_sd(U, A, B) \ - _mm_maskz_sqrt_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_sqrt_ss(W, U, A, B) \ - _mm_mask_sqrt_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_sqrt_ss(U, A, B) \ - 
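/* Editorial sketch: note the operand order in the builtin calls above — the
   low lane of the result is the square root of B's low element, while the
   upper element is passed through from A.  */
static __m128d
sqrt_lo (__m128d a, __m128d b)
{
  return _mm_sqrt_round_sd (a, b, _MM_FROUND_CUR_DIRECTION);  /* { sqrt(b[0]), a[1] } */
}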
_mm_maskz_sqrt_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) - extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cvtepi8_epi32 (__m128i __A) @@ -2802,134 +6163,6 @@ _mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) (__mmask16) __U, __R); } -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, - (__v2df) __B, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mul_round_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mul_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, - (__v4sf) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_mul_round_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_mul_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - const int __R) -{ - return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_div_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, - (__v2df) __B, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_div_round_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_div_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_div_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_divss_round ((__v4sf) __A, - (__v4sf) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_div_round_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 
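/* Editorial sketch (names are mine): the masked scalar divide blends w's low
   lane in when the mask bit is clear; the rounding override applies only to
   the division itself.  */
static __m128
div_lo_down (__m128 w, __mmask8 k, __m128 a, __m128 b)
{
  return _mm_mask_div_round_ss (w, k, a, b,
				_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}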
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - const int __R) -{ - return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} - #else #define _mm512_mul_round_pd(A, B, C) \ (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) @@ -2967,42 +6200,6 @@ _mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B, #define _mm512_maskz_div_round_ps(U, A, B, C) \ (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) -#define _mm_mul_round_sd(A, B, C) \ - (__m128d)__builtin_ia32_mulsd_round(A, B, C) - -#define _mm_mask_mul_round_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_mulsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_mul_round_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_mul_round_ss(A, B, C) \ - (__m128)__builtin_ia32_mulss_round(A, B, C) - -#define _mm_mask_mul_round_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_mulss_mask_round(A, B, W, U, C) - -#define _mm_maskz_mul_round_ss(U, A, B, C) \ - (__m128)__builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#define _mm_div_round_sd(A, B, C) \ - (__m128d)__builtin_ia32_divsd_round(A, B, C) - -#define _mm_mask_div_round_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_divsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_div_round_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_div_round_ss(A, B, C) \ - (__m128)__builtin_ia32_divss_round(A, B, C) - -#define _mm_mask_div_round_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_divss_mask_round(A, B, W, U, C) - -#define _mm_maskz_div_round_ss(U, A, B, C) \ - (__m128)__builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - #endif #ifdef __OPTIMIZE__ @@ -3246,72 +6443,6 @@ _mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B, (__mmask16) __U, __R); } -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_scalef_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_scalef_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_scalef_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) -{ - return (__m128) 
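/* Editorial sketch: scalef is essentially ldexp with a floating-point
   exponent — the low lane of the result is a[0] * 2^floor(b[0]).  */
static double
scale_by_pow2 (double x, double e)
{
  __m128d r = _mm_scalef_round_sd (_mm_set_sd (x), _mm_set_sd (e),
				   _MM_FROUND_CUR_DIRECTION);
  return _mm_cvtsd_f64 (r);   /* scale_by_pow2 (3.0, 4.0) == 48.0 */
}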
__builtin_ia32_scalefss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} #else #define _mm512_scalef_round_pd(A, B, C) \ ((__m512d) \ @@ -3343,51 +6474,8 @@ _mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) (__v16sf) _mm512_setzero_ps(), \ (U), (C))) -#define _mm_scalef_round_sd(A, B, C) \ - ((__m128d) \ - __builtin_ia32_scalefsd_mask_round ((A), (B), \ - (__v2df) _mm_undefined_pd (), \ - -1, (C))) - -#define _mm_scalef_round_ss(A, B, C) \ - ((__m128) \ - __builtin_ia32_scalefss_mask_round ((A), (B), \ - (__v4sf) _mm_undefined_ps (), \ - -1, (C))) - -#define _mm_mask_scalef_round_sd(W, U, A, B, C) \ - ((__m128d) \ - __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C))) - -#define _mm_mask_scalef_round_ss(W, U, A, B, C) \ - ((__m128) \ - __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C))) - -#define _mm_maskz_scalef_round_sd(U, A, B, C) \ - ((__m128d) \ - __builtin_ia32_scalefsd_mask_round ((A), (B), \ - (__v2df) _mm_setzero_pd (), \ - (U), (C))) - -#define _mm_maskz_scalef_round_ss(U, A, B, C) \ - ((__m128) \ - __builtin_ia32_scalefss_mask_round ((A), (B), \ - (__v4sf) _mm_setzero_ps (), \ - (U), (C))) #endif -#define _mm_mask_scalef_sd(W, U, A, B) \ - _mm_mask_scalef_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_scalef_sd(U, A, B) \ - _mm_maskz_scalef_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_scalef_ss(W, U, A, B) \ - _mm_mask_scalef_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_scalef_ss(U, A, B) \ - _mm_maskz_scalef_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) - #ifdef __OPTIMIZE__ extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -5188,115 +8276,6 @@ _mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) #endif -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtu32_sd (__m128d __A, unsigned __B) -{ - return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); -} - -#ifdef __x86_64__ -#ifdef __OPTIMIZE__ -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R) -{ - return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R) -{ - return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R) -{ - return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); -} -#else -#define _mm_cvt_roundu64_sd(A, B, C) \ - (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C) - -#define _mm_cvt_roundi64_sd(A, B, C) \ - (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) - -#define _mm_cvt_roundsi64_sd(A, B, C) \ - (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) -#endif - 
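/* Editorial sketch: _mm_cvtu32_sd converts an unsigned 32-bit integer into
   the low double lane (always exact); the rounded 64-bit variants exist only
   on x86-64, where values above 2^53 make the rounding mode observable.  */
static double
u32_to_double (unsigned v)
{
  return _mm_cvtsd_f64 (_mm_cvtu32_sd (_mm_setzero_pd (), v));
}
#ifdef __x86_64__
static double
u64_to_double_rz (unsigned long long v)
{
  return _mm_cvtsd_f64 (_mm_cvt_roundu64_sd (_mm_setzero_pd (), v,
					     _MM_FROUND_TO_ZERO
					     | _MM_FROUND_NO_EXC));
}
#endif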
-#endif - -#ifdef __OPTIMIZE__ -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); -} -#else -#define _mm_cvt_roundu32_ss(A, B, C) \ - (__m128)__builtin_ia32_cvtusi2ss32(A, B, C) - -#define _mm_cvt_roundi32_ss(A, B, C) \ - (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) - -#define _mm_cvt_roundsi32_ss(A, B, C) \ - (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) -#endif - -#ifdef __x86_64__ -#ifdef __OPTIMIZE__ -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); -} -#else -#define _mm_cvt_roundu64_ss(A, B, C) \ - (__m128)__builtin_ia32_cvtusi2ss64(A, B, C) - -#define _mm_cvt_roundi64_ss(A, B, C) \ - (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) - -#define _mm_cvt_roundsi64_ss(A, B, C) \ - (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) -#endif - -#endif - extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cvtepi32_epi8 (__m512i __A) @@ -6394,83 +9373,6 @@ _mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A) (__mmask16) __U); } -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P) -{ - return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_load_ss (__mmask8 __U, const float *__P) -{ - return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (), - __U); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P) -{ - return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_load_sd (__mmask8 __U, const double *__P) -{ - return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (), - __U); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B, - (__v4sf) __W, __U); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, 
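/* Editorial sketch: unsigned-to-float is inexact above 2^24, so the rounding
   override is observable — for 16777217 this returns 16777218.0f, where
   round-to-nearest-even would give 16777216.0f.  */
static float
u32_to_float_up (unsigned v)
{
  return _mm_cvtss_f32 (_mm_cvt_roundu32_ss (_mm_setzero_ps (), v,
					     _MM_FROUND_TO_POS_INF
					     | _MM_FROUND_NO_EXC));
}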
__always_inline__, __artificial__)) -_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), __U); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B, - (__v2df) __W, __U); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B, - (__v2df) _mm_setzero_pd (), - __U); -} - -extern __inline void -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A) -{ - __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U); -} - -extern __inline void -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A) -{ - __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U); -} - extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_loadu_epi64 (void const *__P) @@ -7273,73 +10175,6 @@ _mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B, (__mmask16) __U, __R); } -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C, - const int __imm, const int __R) -{ - return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, - (__v2df) __B, - (__v2di) __C, __imm, - (__mmask8) -1, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B, - __m128i __C, const int __imm, const int __R) -{ - return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, - (__v2df) __B, - (__v2di) __C, __imm, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - __m128i __C, const int __imm, const int __R) -{ - return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A, - (__v2df) __B, - (__v2di) __C, - __imm, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C, - const int __imm, const int __R) -{ - return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4si) __C, __imm, - (__mmask8) -1, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B, - __m128i __C, const int __imm, const int __R) -{ - return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4si) __C, __imm, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - __m128i __C, const int __imm, const int __R) -{ - return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4si) __C, __imm, - (__mmask8) __U, __R); -} - #else #define _mm512_shuffle_pd(X, Y, C) \ 
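/* Editorial sketch (names are mine): the masked scalar load/store forms
   touch only the low element — with the mask bit clear, load_ss yields w's
   low lane (zero for the maskz form) and store_ss leaves *dst untouched.  */
static void
copy_low_if (float *dst, const float *src, __mmask8 k, __m128 w)
{
  __m128 v = _mm_mask_load_ss (w, k, src);  /* low lane: (k & 1) ? *src : w[0] */
  _mm_mask_store_ss (dst, k, v);            /* writes only when k & 1 is set */
}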
((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \ @@ -7407,35 +10242,6 @@ _mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B, (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ (__mmask16)(U), (R))) -#define _mm_fixupimm_round_sd(X, Y, Z, C, R) \ - ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ - (__mmask8)(-1), (R))) - -#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R) \ - ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ - (__mmask8)(U), (R))) - -#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R) \ - ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ - (__mmask8)(U), (R))) - -#define _mm_fixupimm_round_ss(X, Y, Z, C, R) \ - ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ - (__mmask8)(-1), (R))) - -#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R) \ - ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ - (__mmask8)(U), (R))) - -#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R) \ - ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ - (__mmask8)(U), (R))) #endif extern __inline __m512 @@ -8169,258 +10975,6 @@ _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) (__mmask8) __U); } -#ifdef __x86_64__ -#ifdef __OPTIMIZE__ -extern __inline unsigned long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundss_u64 (__m128 __A, const int __R) -{ - return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R); -} - -extern __inline long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundss_si64 (__m128 __A, const int __R) -{ - return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); -} - -extern __inline long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundss_i64 (__m128 __A, const int __R) -{ - return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); -} - -extern __inline unsigned long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundss_u64 (__m128 __A, const int __R) -{ - return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R); -} - -extern __inline long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundss_i64 (__m128 __A, const int __R) -{ - return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); -} - -extern __inline long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundss_si64 (__m128 __A, const int __R) -{ - return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); -} -#else -#define _mm_cvt_roundss_u64(A, B) \ - ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B)) - -#define _mm_cvt_roundss_si64(A, B) \ - ((long long)__builtin_ia32_vcvtss2si64(A, B)) - -#define _mm_cvt_roundss_i64(A, B) \ - ((long long)__builtin_ia32_vcvtss2si64(A, B)) - -#define _mm_cvtt_roundss_u64(A, B) \ - ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B)) - -#define _mm_cvtt_roundss_i64(A, B) \ - ((long long)__builtin_ia32_vcvttss2si64(A, B)) - -#define _mm_cvtt_roundss_si64(A, B) \ - ((long 
long)__builtin_ia32_vcvttss2si64(A, B)) -#endif -#endif - -#ifdef __OPTIMIZE__ -extern __inline unsigned -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundss_u32 (__m128 __A, const int __R) -{ - return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R); -} - -extern __inline int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundss_si32 (__m128 __A, const int __R) -{ - return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); -} - -extern __inline int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundss_i32 (__m128 __A, const int __R) -{ - return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); -} - -extern __inline unsigned -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundss_u32 (__m128 __A, const int __R) -{ - return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R); -} - -extern __inline int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundss_i32 (__m128 __A, const int __R) -{ - return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); -} - -extern __inline int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundss_si32 (__m128 __A, const int __R) -{ - return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); -} -#else -#define _mm_cvt_roundss_u32(A, B) \ - ((unsigned)__builtin_ia32_vcvtss2usi32(A, B)) - -#define _mm_cvt_roundss_si32(A, B) \ - ((int)__builtin_ia32_vcvtss2si32(A, B)) - -#define _mm_cvt_roundss_i32(A, B) \ - ((int)__builtin_ia32_vcvtss2si32(A, B)) - -#define _mm_cvtt_roundss_u32(A, B) \ - ((unsigned)__builtin_ia32_vcvttss2usi32(A, B)) - -#define _mm_cvtt_roundss_si32(A, B) \ - ((int)__builtin_ia32_vcvttss2si32(A, B)) - -#define _mm_cvtt_roundss_i32(A, B) \ - ((int)__builtin_ia32_vcvttss2si32(A, B)) -#endif - -#ifdef __x86_64__ -#ifdef __OPTIMIZE__ -extern __inline unsigned long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsd_u64 (__m128d __A, const int __R) -{ - return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R); -} - -extern __inline long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsd_si64 (__m128d __A, const int __R) -{ - return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); -} - -extern __inline long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsd_i64 (__m128d __A, const int __R) -{ - return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); -} - -extern __inline unsigned long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundsd_u64 (__m128d __A, const int __R) -{ - return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R); -} - -extern __inline long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundsd_si64 (__m128d __A, const int __R) -{ - return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); -} - -extern __inline long long -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundsd_i64 (__m128d __A, const int __R) -{ - return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); -} -#else -#define _mm_cvt_roundsd_u64(A, B) \ - ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B)) - -#define _mm_cvt_roundsd_si64(A, B) \ - ((long long)__builtin_ia32_vcvtsd2si64(A, B)) - -#define _mm_cvt_roundsd_i64(A, B) \ - ((long 
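/* Editorial sketch: the cvt forms honour the encoded rounding mode while the
   cvtt forms always truncate; _MM_FROUND_NO_EXC additionally suppresses the
   inexact flag.  For -2.5 the three calls below yield -2, -3 and -2.  */
static void
convert_demo (void)
{
  __m128d x = _mm_set_sd (-2.5);
  int near = _mm_cvt_roundsd_si32 (x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  int down = _mm_cvt_roundsd_si32 (x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
  int tr = _mm_cvtt_roundsd_i32 (x, _MM_FROUND_NO_EXC);
  (void) near; (void) down; (void) tr;
}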
long)__builtin_ia32_vcvtsd2si64(A, B)) - -#define _mm_cvtt_roundsd_u64(A, B) \ - ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B)) - -#define _mm_cvtt_roundsd_si64(A, B) \ - ((long long)__builtin_ia32_vcvttsd2si64(A, B)) - -#define _mm_cvtt_roundsd_i64(A, B) \ - ((long long)__builtin_ia32_vcvttsd2si64(A, B)) -#endif -#endif - -#ifdef __OPTIMIZE__ -extern __inline unsigned -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsd_u32 (__m128d __A, const int __R) -{ - return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R); -} - -extern __inline int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsd_si32 (__m128d __A, const int __R) -{ - return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); -} - -extern __inline int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsd_i32 (__m128d __A, const int __R) -{ - return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); -} - -extern __inline unsigned -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundsd_u32 (__m128d __A, const int __R) -{ - return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R); -} - -extern __inline int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundsd_i32 (__m128d __A, const int __R) -{ - return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); -} - -extern __inline int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtt_roundsd_si32 (__m128d __A, const int __R) -{ - return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); -} -#else -#define _mm_cvt_roundsd_u32(A, B) \ - ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B)) - -#define _mm_cvt_roundsd_si32(A, B) \ - ((int)__builtin_ia32_vcvtsd2si32(A, B)) - -#define _mm_cvt_roundsd_i32(A, B) \ - ((int)__builtin_ia32_vcvtsd2si32(A, B)) - -#define _mm_cvtt_roundsd_u32(A, B) \ - ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B)) - -#define _mm_cvtt_roundsd_si32(A, B) \ - ((int)__builtin_ia32_vcvttsd2si32(A, B)) - -#define _mm_cvtt_roundsd_i32(A, B) \ - ((int)__builtin_ia32_vcvttsd2si32(A, B)) -#endif - extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_movedup_pd (__m512d __A) @@ -8741,71 +11295,6 @@ _mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R) (__mmask8) __U, __R); } -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A, - (__v2df) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvt_roundsd_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128d __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, - (__v2df) __B, - (__v4sf) __W, - __U, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvt_roundsd_ss (__mmask8 __U, __m128 __A, - __m128d __B, const int __R) -{ - return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, - (__v2df) __B, - _mm_setzero_ps (), - __U, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) -{ - return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A, - (__v4sf) __B, - __R); -} - -extern __inline __m128d 
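/* Editorial sketch: narrows the low double of b into the low float lane with
   the requested rounding; the upper three float lanes are copied from a.  */
static __m128
narrow_rz (__m128 a, __m128d b)
{
  return _mm_cvt_roundsd_ss (a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}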
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvt_roundss_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128 __B, const int __R) -{ - return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, - (__v4sf) __B, - (__v2df) __W, - __U, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A, - __m128 __B, const int __R) -{ - return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, - (__v4sf) __B, - _mm_setzero_pd (), - __U, - __R); -} #else #define _mm512_cvt_roundpd_ps(A, B) \ (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B) @@ -8816,40 +11305,8 @@ _mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A, #define _mm512_maskz_cvt_roundpd_ps(U, A, B) \ (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B) -#define _mm_cvt_roundsd_ss(A, B, C) \ - (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C) - -#define _mm_mask_cvt_roundsd_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), (W), (U), (C)) - -#define _mm_maskz_cvt_roundsd_ss(U, A, B, C) \ - (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), _mm_setzero_ps (), \ - (U), (C)) - -#define _mm_cvt_roundss_sd(A, B, C) \ - (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C) - -#define _mm_mask_cvt_roundss_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), (W), (U), (C)) - -#define _mm_maskz_cvt_roundss_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), _mm_setzero_pd (), \ - (U), (C)) - #endif -#define _mm_mask_cvtss_sd(W, U, A, B) \ - _mm_mask_cvt_roundss_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_cvtss_sd(U, A, B) \ - _mm_maskz_cvt_roundss_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_cvtsd_ss(W, U, A, B) \ - _mm_mask_cvt_roundsd_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_cvtsd_ss(U, A, B) \ - _mm_maskz_cvt_roundsd_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) - extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_stream_si512 (__m512i * __P, __m512i __A) @@ -8878,87 +11335,7 @@ _mm512_stream_load_si512 (void *__P) return __builtin_ia32_movntdqa512 ((__v8di *)__P); } -/* Constants for mantissa extraction */ -typedef enum -{ - _MM_MANT_NORM_1_2, /* interval [1, 2) */ - _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */ - _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */ - _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */ -} _MM_MANTISSA_NORM_ENUM; - -typedef enum -{ - _MM_MANT_SIGN_src, /* sign = sign(SRC) */ - _MM_MANT_SIGN_zero, /* sign = 0 */ - _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */ -} _MM_MANTISSA_SIGN_ENUM; - #ifdef __OPTIMIZE__ -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, - (__v4sf) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_getexp_round_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_getexp_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - const int 
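/* Editorial sketch: getexp stores floor(log2(|b[0]|)) as a double in the low
   lane, e.g. 48.0 -> 5.0; the upper lane comes from a.  */
static double
exponent_of (double x)
{
  __m128d v = _mm_set_sd (x);
  return _mm_cvtsd_f64 (_mm_getexp_round_sd (v, v, _MM_FROUND_CUR_DIRECTION));
}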
__R) -{ - return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, - (__v2df) __B, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_getexp_round_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_getexp_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_getexp_round_ps (__m512 __A, const int __R) @@ -9091,84 +11468,6 @@ _mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A, __U, __R); } -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_getmant_round_sd (__m128d __A, __m128d __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) -{ - return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, - (__v2df) __B, - (__D << 2) | __C, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_getmant_round_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) -{ - return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__D << 2) | __C, - (__v2df) __W, - __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_getmant_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) -{ - return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__D << 2) | __C, - (__v2df) - _mm_setzero_pd(), - __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_getmant_round_ss (__m128 __A, __m128 __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) -{ - return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, - (__v4sf) __B, - (__D << 2) | __C, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_getmant_round_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) -{ - return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__D << 2) | __C, - (__v4sf) __W, - __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) -{ - return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__D << 2) | __C, - (__v4sf) - _mm_setzero_ps(), - __U, __R); -} - #else #define 
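/* Editorial sketch: getmant normalises |b[0]| into the interval selected by
   the norm enum (the sign enum controls the result's sign); paired with
   getexp it decomposes x = mant * 2^exp, e.g. 48.0 -> 1.5 * 2^5.  */
static double
mantissa_1_2 (double x)
{
  __m128d v = _mm_set_sd (x);
  return _mm_cvtsd_f64 (_mm_getmant_round_sd (v, v, _MM_MANT_NORM_1_2,
					      _MM_MANT_SIGN_src,
					      _MM_FROUND_CUR_DIRECTION));
}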
_mm512_getmant_round_pd(X, B, C, R) \ ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ @@ -9210,68 +11509,6 @@ _mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B, (__v16sf)(__m512)_mm512_setzero_ps(), \ (__mmask16)(U),\ (R))) -#define _mm_getmant_round_sd(X, Y, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (int)(((D)<<2) | (C)), \ - (R))) - -#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (int)(((D)<<2) | (C)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U),\ - (R))) - -#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (int)(((D)<<2) | (C)), \ - (__v2df)(__m128d)_mm_setzero_pd(), \ - (__mmask8)(U),\ - (R))) - -#define _mm_getmant_round_ss(X, Y, C, D, R) \ - ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), \ - (int)(((D)<<2) | (C)), \ - (R))) - -#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R) \ - ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U),\ - (R))) - -#define _mm_maskz_getmant_round_ss(U, X, Y, C, D, R) \ - ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)(__m128)_mm_setzero_ps(), \ - (__mmask8)(U),\ - (R))) - -#define _mm_getexp_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R)) - -#define _mm_mask_getexp_round_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, C) - -#define _mm_maskz_getexp_round_ss(U, A, B, C) \ - (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#define _mm_getexp_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R)) - -#define _mm_mask_getexp_round_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_getexp_round_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - #define _mm512_getexp_round_ps(A, R) \ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ @@ -9363,88 +11600,6 @@ _mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B, (__mmask8) __A, __R); } -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm, - const int __R) -{ - return (__m128) - __builtin_ia32_rndscaless_mask_round ((__v4sf) __A, - (__v4sf) __B, __imm, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_roundscale_round_ss (__m128 __A, __mmask8 __B, __m128 __C, - __m128 __D, const int __imm, const int __R) -{ - return (__m128) - __builtin_ia32_rndscaless_mask_round ((__v4sf) __C, - (__v4sf) __D, __imm, - (__v4sf) __A, - (__mmask8) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_roundscale_round_ss (__mmask8 __A, __m128 __B, __m128 __C, - const int __imm, const int __R) -{ - return (__m128) - __builtin_ia32_rndscaless_mask_round ((__v4sf) __B, - (__v4sf) __C, __imm, - (__v4sf) - 
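/* Editorial sketch of the roundscale imm8 encoding (per the instruction
   reference; my reading): the low two bits select the rounding mode
   (0 nearest, 1 down, 2 up, 3 truncate) and the high nibble is M, rounding
   to a multiple of 2^-M.  */
static double
floor_to_quarters (double x)
{
  __m128d v = _mm_set_sd (x);
  /* imm = (M=2) << 4 | (mode=1, round down): 1.3 -> 1.25.  */
  return _mm_cvtsd_f64 (_mm_roundscale_round_sd (v, v, (2 << 4) | 1,
						 _MM_FROUND_NO_EXC));
}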
_mm_setzero_ps (), - (__mmask8) __A, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm, - const int __R) -{ - return (__m128d) - __builtin_ia32_rndscalesd_mask_round ((__v2df) __A, - (__v2df) __B, __imm, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_roundscale_round_sd (__m128d __A, __mmask8 __B, __m128d __C, - __m128d __D, const int __imm, const int __R) -{ - return (__m128d) - __builtin_ia32_rndscalesd_mask_round ((__v2df) __C, - (__v2df) __D, __imm, - (__v2df) __A, - (__mmask8) __B, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_roundscale_round_sd (__mmask8 __A, __m128d __B, __m128d __C, - const int __imm, const int __R) -{ - return (__m128d) - __builtin_ia32_rndscalesd_mask_round ((__v2df) __B, - (__v2df) __C, __imm, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __A, - __R); -} - #else #define _mm512_roundscale_round_ps(A, B, R) \ ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\ @@ -9472,54 +11627,6 @@ _mm_maskz_roundscale_round_sd (__mmask8 __A, __m128d __B, __m128d __C, (int)(C), \ (__v8df)_mm512_setzero_pd(),\ (__mmask8)(A), R)) -#define _mm_roundscale_round_ss(A, B, I, R) \ - ((__m128) \ - __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \ - (__v4sf) (__m128) (B), \ - (int) (I), \ - (__v4sf) _mm_setzero_ps (), \ - (__mmask8) (-1), \ - (int) (R))) -#define _mm_mask_roundscale_round_ss(A, U, B, C, I, R) \ - ((__m128) \ - __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), \ - (__v4sf) (__m128) (C), \ - (int) (I), \ - (__v4sf) (__m128) (A), \ - (__mmask8) (U), \ - (int) (R))) -#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ - ((__m128) \ - __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \ - (__v4sf) (__m128) (B), \ - (int) (I), \ - (__v4sf) _mm_setzero_ps (), \ - (__mmask8) (U), \ - (int) (R))) -#define _mm_roundscale_round_sd(A, B, I, R) \ - ((__m128d) \ - __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \ - (__v2df) (__m128d) (B), \ - (int) (I), \ - (__v2df) _mm_setzero_pd (), \ - (__mmask8) (-1), \ - (int) (R))) -#define _mm_mask_roundscale_round_sd(A, U, B, C, I, R) \ - ((__m128d) \ - __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), \ - (__v2df) (__m128d) (C), \ - (int) (I), \ - (__v2df) (__m128d) (A), \ - (__mmask8) (U), \ - (int) (R))) -#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ - ((__m128d) \ - __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \ - (__v2df) (__m128d) (B), \ - (int) (I), \ - (__v2df) _mm_setzero_pd (), \ - (__mmask8) (U), \ - (int) (R))) #endif extern __inline __m512 @@ -10068,22 +12175,6 @@ _mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y) #define _MM_CMPINT_GT 0x6 #ifdef __OPTIMIZE__ -extern __inline __mmask16 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kshiftli_mask16 (__mmask16 __A, unsigned int __B) -{ - return (__mmask16) __builtin_ia32_kshiftlihi ((__mmask16) __A, - (__mmask8) __B); -} - -extern __inline __mmask16 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kshiftri_mask16 (__mmask16 __A, unsigned int __B) -{ - return (__mmask16) __builtin_ia32_kshiftrihi ((__mmask16) __A, - (__mmask8) __B); -} - extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, 
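/* Editorial sketch: the _MM_CMPINT_* constants above select the predicate;
   each bit of the returned mask corresponds to one 64-bit lane.  */
static __mmask8
lanes_le (__m512i x, __m512i y)
{
  return _mm512_cmp_epi64_mask (x, y, _MM_CMPINT_LE);
}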
__artificial__)) _mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P) @@ -10199,51 +12290,7 @@ _mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, (__mmask16) __U, __R); } -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R) -{ - return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, - (__v2df) __Y, __P, - (__mmask8) -1, __R); -} - -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, - const int __P, const int __R) -{ - return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, - (__v2df) __Y, __P, - (__mmask8) __M, __R); -} - -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R) -{ - return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, - (__v4sf) __Y, __P, - (__mmask8) -1, __R); -} - -extern __inline __mmask8 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, - const int __P, const int __R) -{ - return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, - (__v4sf) __Y, __P, - (__mmask8) __M, __R); -} - #else -#define _kshiftli_mask16(X, Y) \ - ((__mmask16) __builtin_ia32_kshiftlihi ((__mmask16)(X), (__mmask8)(Y))) - -#define _kshiftri_mask16(X, Y) \ - ((__mmask16) __builtin_ia32_kshiftrihi ((__mmask16)(X), (__mmask8)(Y))) - #define _mm512_cmp_epi64_mask(X, Y, P) \ ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), \ (__v8di)(__m512i)(Y), (int)(P),\ @@ -10304,25 +12351,6 @@ _mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, (__v16sf)(__m512)(Y), (int)(P),\ (__mmask16)(M), R)) -#define _mm_cmp_round_sd_mask(X, Y, P, R) \ - ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P),\ - (__mmask8)-1, R)) - -#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ - ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P),\ - (M), R)) - -#define _mm_cmp_round_ss_mask(X, Y, P, R) \ - ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1, R)) - -#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ - ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (M), R)) #endif #ifdef __OPTIMIZE__ @@ -11110,66 +13138,6 @@ _mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P) (), (__mmask16) __U); } -/* Mask arithmetic operations */ -#define _kand_mask16 _mm512_kand -#define _kandn_mask16 _mm512_kandn -#define _knot_mask16 _mm512_knot -#define _kor_mask16 _mm512_kor -#define _kxnor_mask16 _mm512_kxnor -#define _kxor_mask16 _mm512_kxor - -extern __inline unsigned char -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF) -{ - *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B); - return (unsigned char) __builtin_ia32_kortestzhi (__A, __B); -} - -extern __inline unsigned char -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B) -{ - return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A, - (__mmask16) __B); -} - -extern __inline unsigned char -__attribute__ 
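/* Editorial sketch: passing _MM_FROUND_NO_EXC (SAE) keeps a signalling
   predicate such as _CMP_LT_OS from raising the invalid exception when an
   operand is a quiet NaN.  */
static __mmask8
low_lt_sae (__m128d x, __m128d y)
{
  return _mm_cmp_round_sd_mask (x, y, _CMP_LT_OS, _MM_FROUND_NO_EXC);
}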
((__gnu_inline__, __always_inline__, __artificial__)) -_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B) -{ - return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A, - (__mmask16) __B); -} - -extern __inline unsigned int -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtmask16_u32 (__mmask16 __A) -{ - return (unsigned int) __builtin_ia32_kmovw ((__mmask16 ) __A); -} - -extern __inline __mmask16 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_cvtu32_mask16 (unsigned int __A) -{ - return (__mmask16) __builtin_ia32_kmovw ((__mmask16 ) __A); -} - -extern __inline __mmask16 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_load_mask16 (__mmask16 *__A) -{ - return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A); -} - -extern __inline void -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_store_mask16 (__mmask16 *__A, __mmask16 __B) -{ - *(__mmask16 *) __A = __builtin_ia32_kmovw (__B); -} - extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_kand (__mmask16 __A, __mmask16 __B) @@ -11236,13 +13204,6 @@ _mm512_kunpackb (__mmask16 __A, __mmask16 __B) return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } -extern __inline __mmask16 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kunpackb_mask16 (__mmask8 __A, __mmask8 __B) -{ - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); -} - #ifdef __OPTIMIZE__ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -11593,174 +13554,6 @@ _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) (__mmask16) __U); } -#ifdef __OPTIMIZE__ -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, - (__v2df) __B, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_round_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_max_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, - (__v4sf) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_max_round_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_max_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - const int __R) -{ - return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} - -extern __inline 
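/* Editorial sketch: the k-register helpers move masks to and from scalars
   and test them; kortestz reports whether the OR of two masks is all
   zero.  */
static int
any_lane_set (__mmask16 a, __mmask16 b)
{
  return !_kortestz_mask16_u8 (a, b);   /* 1 iff (a | b) != 0 */
}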
__m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_round_sd (__m128d __A, __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, - (__v2df) __B, - __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_round_sd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) -{ - return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_round_sd (__mmask8 __U, __m128d __A, __m128d __B, - const int __R) -{ - return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_min_round_ss (__m128 __A, __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_minss_round ((__v4sf) __A, - (__v4sf) __B, - __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_min_round_ss (__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) -{ - return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, __R); -} - -extern __inline __m128 -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_maskz_min_round_ss (__mmask8 __U, __m128 __A, __m128 __B, - const int __R) -{ - return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U, __R); -} - -#else -#define _mm_max_round_sd(A, B, C) \ - (__m128d)__builtin_ia32_maxsd_round(A, B, C) - -#define _mm_mask_max_round_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_maxsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_max_round_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_max_round_ss(A, B, C) \ - (__m128)__builtin_ia32_maxss_round(A, B, C) - -#define _mm_mask_max_round_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_maxss_mask_round(A, B, W, U, C) - -#define _mm_maskz_max_round_ss(U, A, B, C) \ - (__m128)__builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#define _mm_min_round_sd(A, B, C) \ - (__m128d)__builtin_ia32_minsd_round(A, B, C) - -#define _mm_mask_min_round_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_minsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_min_round_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_min_round_ss(A, B, C) \ - (__m128)__builtin_ia32_minss_round(A, B, C) - -#define _mm_mask_min_round_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_minss_mask_round(A, B, W, U, C) - -#define _mm_maskz_min_round_ss(U, A, B, C) \ - (__m128)__builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#endif - extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W) @@ -11797,735 +13590,6 @@ _mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W) (__mmask16) __U); } -#ifdef __OPTIMIZE__ -extern __inline __m128d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) -{ - return (__m128d) 
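/* Editorial sketch: the scalar max/min forms take an SAE operand rather than
   a rounding mode (the operation is exact); as I read the VMAXSD convention,
   the low lane yields the second operand when either input is NaN.  */
static __m128d
max_lo (__m128d a, __m128d b)
{
  return _mm_max_round_sd (a, b, _MM_FROUND_NO_EXC);
}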
-						   (__v2df) __A,
-						   (__v2df) __B,
-						   __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
-						  (__v4sf) __A,
-						  (__v4sf) __B,
-						  __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
-						   (__v2df) __A,
-						   -(__v2df) __B,
-						   __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
-						  (__v4sf) __A,
-						  -(__v4sf) __B,
-						  __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
-						   -(__v2df) __A,
-						   (__v2df) __B,
-						   __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  (__v4sf) __B,
-						  __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
-						   -(__v2df) __A,
-						   -(__v2df) __B,
-						   __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  -(__v4sf) __B,
-						  __R);
-}
-#else
-#define _mm_fmadd_round_sd(A, B, C, R)            \
-    (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R)
-
-#define _mm_fmadd_round_ss(A, B, C, R)            \
-    (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R)
-
-#define _mm_fmsub_round_sd(A, B, C, R)            \
-    (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R)
-
-#define _mm_fmsub_round_ss(A, B, C, R)            \
-    (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R)
-
-#define _mm_fnmadd_round_sd(A, B, C, R)            \
-    (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R)
-
-#define _mm_fnmadd_round_ss(A, B, C, R)            \
-    (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R)
-
-#define _mm_fnmsub_round_sd(A, B, C, R)            \
-    (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R)
-
-#define _mm_fnmsub_round_ss(A, B, C, R)            \
-    (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R)
-#endif
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
-						  (__v2df) __A,
-						  (__v2df) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-						 (__v4sf) __A,
-						 (__v4sf) __B,
-						 (__mmask8) __U,
-						 _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
-						   (__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
-						  (__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
-						   (__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
-						  (__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
-						  (__v2df) __A,
-						  -(__v2df) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-						 (__v4sf) __A,
-						 -(__v4sf) __B,
-						 (__mmask8) __U,
-						 _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
-						   (__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
-						  (__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
-						   (__v2df) __A,
-						   -(__v2df) __B,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
-						  (__v4sf) __A,
-						  -(__v4sf) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
-						  -(__v2df) __A,
-						  (__v2df) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-						 -(__v4sf) __A,
-						 (__v4sf) __B,
-						 (__mmask8) __U,
-						 _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fnmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
-						   -(__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fnmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
-						   -(__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
-						  -(__v2df) __A,
-						  -(__v2df) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-						 -(__v4sf) __A,
-						 -(__v4sf) __B,
-						 (__mmask8) __U,
-						 _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fnmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
-{
-  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
-						   -(__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fnmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
-{
-  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
-						   -(__v2df) __A,
-						   -(__v2df) __B,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  -(__v4sf) __B,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __OPTIMIZE__
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
-			 const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
-						  (__v2df) __A,
-						  (__v2df) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
-			 const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-						 (__v4sf) __A,
-						 (__v4sf) __B,
-						 (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
-			  const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
-						   (__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
-			  const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
-						  (__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
-			  const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
-						   (__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
-			  const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
-						  (__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
-			 const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
-						  (__v2df) __A,
-						  -(__v2df) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
-			 const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-						 (__v4sf) __A,
-						 -(__v4sf) __B,
-						 (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
-			  const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
-						   (__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
-			  const int __R)
-{
-  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
-						  (__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
-			  const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
-						   (__v2df) __A,
-						   -(__v2df) __B,
-						   (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
-			  const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
-						  (__v4sf) __A,
-						  -(__v4sf) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fnmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
-			  const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
-						  -(__v2df) __A,
-						  (__v2df) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fnmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
-			  const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-						 -(__v4sf) __A,
-						 (__v4sf) __B,
-						 (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
-			   const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
-						   -(__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
-			   const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fnmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
-			   const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
-						   -(__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fnmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
-			   const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fnmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
-			  const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
-						  -(__v2df) __A,
-						  -(__v2df) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fnmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
-			  const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
-						 -(__v4sf) __A,
-						 -(__v4sf) __B,
-						 (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
-			   const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
-						   -(__v2df) __A,
-						   (__v2df) __B,
-						   (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask3_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
-			   const int __R)
-{
-  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  (__v4sf) __B,
-						  (__mmask8) __U, __R);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fnmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
-			   const int __R)
-{
-  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
-						   -(__v2df) __A,
-						   -(__v2df) __B,
-						   (__mmask8) __U, __R);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fnmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
-			   const int __R)
-{
-  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
-						  -(__v4sf) __A,
-						  -(__v4sf) __B,
-						  (__mmask8) __U, __R);
-}
-#else
-#define _mm_mask_fmadd_round_sd(A, U, B, C, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, C, U, R)
-
-#define _mm_mask_fmadd_round_ss(A, U, B, C, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_mask (A, B, C, U, R)
-
-#define _mm_mask3_fmadd_round_sd(A, B, C, U, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, B, C, U, R)
-
-#define _mm_mask3_fmadd_round_ss(A, B, C, U, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_mask3 (A, B, C, U, R)
-
-#define _mm_maskz_fmadd_round_sd(U, A, B, C, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, C, U, R)
-
-#define _mm_maskz_fmadd_round_ss(U, A, B, C, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, C, U, R)
-
-#define _mm_mask_fmsub_round_sd(A, U, B, C, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, -(C), U, R)
-
-#define _mm_mask_fmsub_round_ss(A, U, B, C, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_mask (A, B, -(C), U, R)
-
-#define _mm_mask3_fmsub_round_sd(A, B, C, U, R)            \
-    (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, B, C, U, R)
-
-#define _mm_mask3_fmsub_round_ss(A, B, C, U, R)            \
-    (__m128) __builtin_ia32_vfmsubss3_mask3 (A, B, C, U, R)
-
-#define _mm_maskz_fmsub_round_sd(U, A, B, C, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, -(C), U, R)
-
-#define _mm_maskz_fmsub_round_ss(U, A, B, C, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, -(C), U, R)
-
-#define _mm_mask_fnmadd_round_sd(A, U, B, C, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), C, U, R)
-
-#define _mm_mask_fnmadd_round_ss(A, U, B, C, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), C, U, R)
-
-#define _mm_mask3_fnmadd_round_sd(A, B, C, U, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, -(B), C, U, R)
-
-#define _mm_mask3_fnmadd_round_ss(A, B, C, U, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_mask3 (A, -(B), C, U, R)
-
-#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), C, U, R)
-
-#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), C, U, R)
-
-#define _mm_mask_fnmsub_round_sd(A, U, B, C, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), -(C), U, R)
-
-#define _mm_mask_fnmsub_round_ss(A, U, B, C, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), -(C), U, R)
-
-#define _mm_mask3_fnmsub_round_sd(A, B, C, U, R)            \
-    (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, -(B), C, U, R)
-
-#define _mm_mask3_fnmsub_round_ss(A, B, C, U, R)            \
-    (__m128) __builtin_ia32_vfmsubss3_mask3 (A, -(B), C, U, R)
-
-#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R)            \
-    (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), -(C), U, R)
-
-#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R)            \
-    (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), -(C), U, R)
-#endif
-
-#ifdef __OPTIMIZE__
-extern __inline int
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R)
-{
-  return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R);
-}
-
-extern __inline int
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R)
-{
-  return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R);
-}
-#else
-#define _mm_comi_round_ss(A, B, C, D)\
-__builtin_ia32_vcomiss(A, B, C, D)
-#define _mm_comi_round_sd(A, B, C, D)\
-__builtin_ia32_vcomisd(A, B, C, D)
-#endif
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_sqrt_pd (__m512d __A)
@@ -12650,52 +13714,6 @@ _mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B)
					  _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df) __W,
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df)
-						    _mm_setzero_pd (),
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf) __W,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf)
-						   _mm_setzero_ps (),
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_sub_pd (__m512d __A, __m512d __B)
@@ -12756,52 +13774,6 @@ _mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B)
					  _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df) __W,
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df)
-						    _mm_setzero_pd (),
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf) __W,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf)
-						   _mm_setzero_ps (),
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mul_pd (__m512d __A, __m512d __B)
@@ -12862,54 +13834,6 @@ _mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B)
					  _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A,
-		 __m128d __B)
-{
-  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df) __W,
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df)
-						    _mm_setzero_pd (),
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A,
-		 __m128 __B)
-{
-  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf) __W,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf)
-						   _mm_setzero_ps (),
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_div_pd (__m512d __M, __m512d __V)
@@ -12970,54 +13894,6 @@ _mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B)
					  _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A,
-		 __m128d __B)
-{
-  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df) __W,
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df)
-						    _mm_setzero_pd (),
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A,
-		 __m128 __B)
-{
-  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf) __W,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf)
-						   _mm_setzero_ps (),
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_max_pd (__m512d __A, __m512d __B)
@@ -13088,52 +13964,6 @@ _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
					  _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df) __W,
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df)
-						    _mm_setzero_pd (),
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf) __W,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf)
-						   _mm_setzero_ps (),
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_min_pd (__m512d __A, __m512d __B)
@@ -13204,52 +14034,6 @@ _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
					  _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df) __W,
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2df)
-						    _mm_setzero_pd (),
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf) __W,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4sf)
-						   _mm_setzero_ps (),
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_scalef_pd (__m512d __A, __m512d __B)
@@ -13320,30 +14104,6 @@ _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
					  _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_scalef_sd (__m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
-						       (__v2df) __B,
-						       (__v2df)
-						       _mm_setzero_pd (),
-						       (__mmask8) -1,
-						       _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_scalef_ss (__m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
-						      (__v4sf) __B,
-						      (__v4sf)
-						      _mm_setzero_ps (),
-						      (__mmask8) -1,
-						      _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C)
@@ -14142,32 +14902,6 @@ _mm512_cvtss_f32 (__m512 __A)
   return __A[0];
 }

-#ifdef __x86_64__
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
-{
-  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
-					      _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
-{
-  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
-					       _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtu32_ss (__m128 __A, unsigned __B)
-{
-  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
-					      _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_cvtepi32_ps (__m512i __A)
@@ -14309,76 +15043,6 @@ _mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B,
					  _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm)
-{
-  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
-						   (__v2df) __B,
-						   (__v2di) __C, __imm,
-						   (__mmask8) -1,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B,
-		      __m128i __C, const int __imm)
-{
-  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
-						   (__v2df) __B,
-						   (__v2di) __C, __imm,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B,
-		       __m128i __C, const int __imm)
-{
-  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
-						    (__v2df) __B,
-						    (__v2di) __C,
-						    __imm,
-						    (__mmask8) __U,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm)
-{
-  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
-						  (__v4sf) __B,
-						  (__v4si) __C, __imm,
-						  (__mmask8) -1,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B,
-		      __m128i __C, const int __imm)
-{
-  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
-						  (__v4sf) __B,
-						  (__v4si) __C, __imm,
-						  (__mmask8) __U,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B,
-		       __m128i __C, const int __imm)
-{
-  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
-						   (__v4sf) __B,
-						   (__v4si) __C, __imm,
-						   (__mmask8) __U,
-						   _MM_FROUND_CUR_DIRECTION);
-}
 #else
 #define _mm512_fixupimm_pd(X, Y, Z, C)					\
   ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),	\
@@ -14410,65 +15074,8 @@ _mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B,
     (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),		\
     (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

-#define _mm_fixupimm_sd(X, Y, Z, C)					\
-  ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
-    (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
-    (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_fixupimm_sd(X, U, Y, Z, C)				\
-  ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
-    (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
-    (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C)				\
-  ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X),	\
-    (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
-    (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_fixupimm_ss(X, Y, Z, C)					\
-  ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),		\
-    (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
-    (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_fixupimm_ss(X, U, Y, Z, C)				\
-  ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),		\
-    (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
-    (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C)				\
-  ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X),	\
-    (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
-    (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
 #endif

-#ifdef __x86_64__
-extern __inline unsigned long long
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtss_u64 (__m128 __A)
-{
-  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
-							   __A,
-							   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline unsigned long long
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttss_u64 (__m128 __A)
-{
-  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
-							    __A,
-							    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline long long
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttss_i64 (__m128 __A)
-{
-  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-#endif /* __x86_64__ */
-
 extern __inline int
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_cvtsi512_si32 (__m512i __A)
@@ -14477,138 +15084,6 @@ _mm512_cvtsi512_si32 (__m512i __A)
   return __B[0];
 }

-extern __inline unsigned
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtss_u32 (__m128 __A)
-{
-  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
-						 _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline unsigned
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttss_u32 (__m128 __A)
-{
-  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline int
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttss_i32 (__m128 __A)
-{
-  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
-					    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline int
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_i32 (__m128d __A)
-{
-  return (int) __builtin_ia32_cvtsd2si ((__v2df) __A);
-}
-
-extern __inline int
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtss_i32 (__m128 __A)
-{
-  return (int) __builtin_ia32_cvtss2si ((__v4sf) __A);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvti32_sd (__m128d __A, int __B)
-{
-  return (__m128d) __builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvti32_ss (__m128 __A, int __B)
-{
-  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
-}
-
-#ifdef __x86_64__
-extern __inline unsigned long long
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_u64 (__m128d __A)
-{
-  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
-							   __A,
-							   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline unsigned long long
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttsd_u64 (__m128d __A)
-{
-  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
-							    __A,
-							    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline long long
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttsd_i64 (__m128d __A)
-{
-  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline long long
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_i64 (__m128d __A)
-{
-  return (long long) __builtin_ia32_cvtsd2si64 ((__v2df) __A);
-}
-
-extern __inline long long
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtss_i64 (__m128 __A)
-{
-  return (long long) __builtin_ia32_cvtss2si64 ((__v4sf) __A);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvti64_sd (__m128d __A, long long __B)
-{
-  return (__m128d) __builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvti64_ss (__m128 __A, long long __B)
-{
-  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
-}
-#endif /* __x86_64__ */
-
-extern __inline unsigned
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsd_u32 (__m128d __A)
-{
-  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
-						 _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline unsigned
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttsd_u32 (__m128d __A)
-{
-  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline int
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvttsd_i32 (__m128d __A)
-{
-  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
-					    _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_cvtps_pd (__m256 __A)
@@ -14770,70 +15245,6 @@ _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
					 _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_getexp_ss (__m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
-						    (__v4sf) __B,
-						    _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
-						      (__v4sf) __B,
-						      (__v4sf) __W,
-						      (__mmask8) __U,
-						      _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
-  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
-						      (__v4sf) __B,
-						      (__v4sf)
-						      _mm_setzero_ps (),
-						      (__mmask8) __U,
-						      _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_getexp_sd (__m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
-						     (__v2df) __B,
-						     _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
-						       (__v2df) __B,
-						       (__v2df) __W,
-						       (__mmask8) __U,
-						       _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
-  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
-						       (__v2df) __B,
-						       (__v2df)
-						       _mm_setzero_pd (),
-						       (__mmask8) __U,
-						       _MM_FROUND_CUR_DIRECTION);
-}
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
@@ -14906,82 +15317,6 @@ _mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A,
					 _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
-		_MM_MANTISSA_SIGN_ENUM __D)
-{
-  return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
-						   (__v2df) __B,
-						   (__D << 2) | __C,
-						   _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_getmant_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
-		     _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
-{
-  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
-							(__v2df) __B,
-							(__D << 2) | __C,
-							(__v2df) __W,
-							__U,
-							_MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_getmant_sd (__mmask8 __U, __m128d __A, __m128d __B,
-		      _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
-{
-  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
-							(__v2df) __B,
-							(__D << 2) | __C,
-							(__v2df)
-							_mm_setzero_pd(),
-							__U,
-							_MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
-		_MM_MANTISSA_SIGN_ENUM __D)
-{
-  return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
-						  (__v4sf) __B,
-						  (__D << 2) | __C,
-						  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_getmant_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
-		     _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
-{
-  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
-						       (__v4sf) __B,
-						       (__D << 2) | __C,
-						       (__v4sf) __W,
-						       __U,
-						       _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B,
-		      _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
-{
-  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
-						       (__v4sf) __B,
-						       (__D << 2) | __C,
-						       (__v4sf)
-						       _mm_setzero_ps(),
-						       __U,
-						       _MM_FROUND_CUR_DIRECTION);
-}
-
 #else
 #define _mm512_getmant_pd(X, B, C)					\
   ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),	\
@@ -15023,74 +15358,6 @@ _mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B,
					     (__v16sf)_mm512_setzero_ps(), \
					     (__mmask16)(U),		\
					     _MM_FROUND_CUR_DIRECTION))
-#define _mm_getmant_sd(X, Y, C, D)					\
-  ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X),	\
-					    (__v2df)(__m128d)(Y),	\
-					    (int)(((D)<<2) | (C)),	\
-					    _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_sd(W, U, X, Y, C, D)				\
-  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),	\
-						 (__v2df)(__m128d)(Y),	\
-						 (int)(((D)<<2) | (C)),	\
-						 (__v2df)(__m128d)(W),	\
-						 (__mmask8)(U),		\
-						 _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_getmant_sd(U, X, Y, C, D)				\
-  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),	\
-						 (__v2df)(__m128d)(Y),	\
-						 (int)(((D)<<2) | (C)),	\
-						 (__v2df)_mm_setzero_pd(), \
-						 (__mmask8)(U),		\
-						 _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_getmant_ss(X, Y, C, D)					\
-  ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X),		\
-					   (__v4sf)(__m128)(Y),		\
-					   (int)(((D)<<2) | (C)),	\
-					   _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_ss(W, U, X, Y, C, D)				\
-  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),	\
-						(__v4sf)(__m128)(Y),	\
-						(int)(((D)<<2) | (C)),	\
-						(__v4sf)(__m128)(W),	\
-						(__mmask8)(U),		\
-						_MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_getmant_ss(U, X, Y, C, D)				\
-  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),	\
-						(__v4sf)(__m128)(Y),	\
-						(int)(((D)<<2) | (C)),	\
-						(__v4sf)_mm_setzero_ps(), \
-						(__mmask8)(U),		\
-						_MM_FROUND_CUR_DIRECTION))
-
-#define _mm_getexp_ss(A, B)						\
-  ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
-					    _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getexp_ss(W, U, A, B)					\
-    (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U,		\
-					       _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_getexp_ss(U, A, B)					\
-    (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, \
-					       _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_getexp_sd(A, B)						\
-  ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
-					     _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getexp_sd(W, U, A, B)					\
-    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U,		\
-						_MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_getexp_sd(U, A, B)					\
-    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, \
-						_MM_FROUND_CUR_DIRECTION)
-
 #define _mm512_getexp_ps(A)						\
   ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),	\
   (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))
@@ -15185,87 +15452,6 @@ _mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm)
				      _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm)
-{
-  return (__m128)
-    __builtin_ia32_rndscaless_mask_round ((__v4sf) __A,
-					  (__v4sf) __B, __imm,
-					  (__v4sf)
-					  _mm_setzero_ps (),
-					  (__mmask8) -1,
-					  _MM_FROUND_CUR_DIRECTION);
-}
-
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_roundscale_ss (__m128 __A, __mmask8 __B, __m128 __C, __m128 __D,
-			const int __imm)
-{
-  return (__m128)
-    __builtin_ia32_rndscaless_mask_round ((__v4sf) __C,
-					  (__v4sf) __D, __imm,
-					  (__v4sf) __A,
-					  (__mmask8) __B,
-					  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_roundscale_ss (__mmask8 __A, __m128 __B, __m128 __C,
-			 const int __imm)
-{
-  return (__m128)
-    __builtin_ia32_rndscaless_mask_round ((__v4sf) __B,
-					  (__v4sf) __C, __imm,
-					  (__v4sf)
-					  _mm_setzero_ps (),
-					  (__mmask8) __A,
-					  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm)
-{
-  return (__m128d)
-    __builtin_ia32_rndscalesd_mask_round ((__v2df) __A,
-					  (__v2df) __B, __imm,
-					  (__v2df)
-					  _mm_setzero_pd (),
-					  (__mmask8) -1,
-					  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_roundscale_sd (__m128d __A, __mmask8 __B, __m128d __C, __m128d __D,
-			const int __imm)
-{
-  return (__m128d)
-    __builtin_ia32_rndscalesd_mask_round ((__v2df) __C,
-					  (__v2df) __D, __imm,
-					  (__v2df) __A,
-					  (__mmask8) __B,
-					  _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_roundscale_sd (__mmask8 __A, __m128d __B, __m128d __C,
-			 const int __imm)
-{
-  return (__m128d)
-    __builtin_ia32_rndscalesd_mask_round ((__v2df) __B,
-					  (__v2df) __C, __imm,
-					  (__v2df)
-					  _mm_setzero_pd (),
-					  (__mmask8) __A,
-					  _MM_FROUND_CUR_DIRECTION);
-}
-
 #else
 #define _mm512_roundscale_ps(A, B)					\
   ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
@@ -15293,54 +15479,6 @@ _mm_maskz_roundscale_sd (__mmask8 __A, __m128d __B, __m128d __C,
					    (int)(C),			\
					    (__v8df)_mm512_setzero_pd(),\
					    (__mmask8)(A), _MM_FROUND_CUR_DIRECTION))
-#define _mm_roundscale_ss(A, B, I)					\
-  ((__m128)								\
-   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A),	\
-					 (__v4sf) (__m128) (B),		\
-					 (int) (I),			\
-					 (__v4sf) _mm_setzero_ps (),	\
-					 (__mmask8) (-1),		\
-					 _MM_FROUND_CUR_DIRECTION))
-#define _mm_mask_roundscale_ss(A, U, B, C, I)				\
-  ((__m128)								\
-   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B),	\
-					 (__v4sf) (__m128) (C),		\
-					 (int) (I),			\
-					 (__v4sf) (__m128) (A),		\
-					 (__mmask8) (U),		\
-					 _MM_FROUND_CUR_DIRECTION))
-#define _mm_maskz_roundscale_ss(U, A, B, I)				\
-  ((__m128)								\
-   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A),	\
-					 (__v4sf) (__m128) (B),		\
-					 (int) (I),			\
-					 (__v4sf) _mm_setzero_ps (),	\
-					 (__mmask8) (U),		\
-					 _MM_FROUND_CUR_DIRECTION))
-#define _mm_roundscale_sd(A, B, I)					\
-  ((__m128d)								\
-   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A),	\
-					 (__v2df) (__m128d) (B),	\
-					 (int) (I),			\
-					 (__v2df) _mm_setzero_pd (),	\
-					 (__mmask8) (-1),		\
-					 _MM_FROUND_CUR_DIRECTION))
-#define _mm_mask_roundscale_sd(A, U, B, C, I)				\
-  ((__m128d)								\
-   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B),	\
-					 (__v2df) (__m128d) (C),	\
-					 (int) (I),			\
-					 (__v2df) (__m128d) (A),	\
-					 (__mmask8) (U),		\
-					 _MM_FROUND_CUR_DIRECTION))
-#define _mm_maskz_roundscale_sd(U, A, B, I)				\
-  ((__m128d)								\
-   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A),	\
-					 (__v2df) (__m128d) (B),	\
-					 (int) (I),			\
-					 (__v2df) _mm_setzero_pd (),	\
-					 (__mmask8) (U),		\
-					 _MM_FROUND_CUR_DIRECTION))
 #endif

 #ifdef __OPTIMIZE__
@@ -15384,46 +15522,6 @@ _mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P)
					 _MM_FROUND_CUR_DIRECTION);
 }

-extern __inline __mmask8
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P)
-{
-  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
-					       (__v2df) __Y, __P,
-					       (__mmask8) -1,
-					       _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __mmask8
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P)
-{
-  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
-					       (__v2df) __Y, __P,
-					       (__mmask8) __M,
-					       _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __mmask8
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P)
-{
-  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
-					       (__v4sf) __Y, __P,
-					       (__mmask8) -1,
-					       _MM_FROUND_CUR_DIRECTION);
-}
-
-extern __inline __mmask8
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P)
-{
-  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
-					       (__v4sf) __Y, __P,
-					       (__mmask8) __M,
-					       _MM_FROUND_CUR_DIRECTION);
-}
-
 #else
 #define _mm512_cmp_pd_mask(X, Y, P)					\
   ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
@@ -15445,25 +15543,6 @@ _mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P)
					    (__v16sf)(__m512)(Y), (int)(P),\
					    (__mmask16)(M),_MM_FROUND_CUR_DIRECTION))

-#define _mm_cmp_sd_mask(X, Y, P)					\
-  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
-					 (__v2df)(__m128d)(Y), (int)(P),\
-					 (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_cmp_sd_mask(M, X, Y, P)				\
-  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
-					 (__v2df)(__m128d)(Y), (int)(P),\
-					 M,_MM_FROUND_CUR_DIRECTION))
-
-#define _mm_cmp_ss_mask(X, Y, P)					\
-  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
-					 (__v4sf)(__m128)(Y), (int)(P),	\
-					 (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_cmp_ss_mask(M, X, Y, P)				\
-  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
-					 (__v4sf)(__m128)(Y), (int)(P),	\
-					 M,_MM_FROUND_CUR_DIRECTION))
 #endif

 extern __inline __mmask8
@@ -16493,9 +16572,9 @@ _mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A)

 #undef __MM512_REDUCE_OP

-#ifdef __DISABLE_AVX512F__
-#undef __DISABLE_AVX512F__
+#ifdef __DISABLE_AVX512F_512__
+#undef __DISABLE_AVX512F_512__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX512F__ */
+#endif /* __DISABLE_AVX512F_512__ */

 #endif /* _AVX512FINTRIN_H_INCLUDED */
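
Usage note (illustrative, not part of the patch): the definitions deleted in the hunks above are re-added near the top of the header, and the final hunk switches the 512-bit section's guard macro to __DISABLE_AVX512F_512__ so those intrinsics can carry the evex512 target. The scalar round intrinsics themselves stay usable with plain AVX512F. A minimal caller, as a sketch; the exact compile options are an assumption based on the series description, not something this diff shows:

/* Sketch: exercise _mm_add_round_sd, one of the relocated scalar
   intrinsics.  Assumed build: gcc -O2 -mavx512f example.c  */
#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m128d a = _mm_set_sd (1.5);
  __m128d b = _mm_set_sd (2.25);
  /* The low lane is added under an explicit round-to-nearest,
     suppress-all-exceptions override; the upper lane is copied
     from the first operand.  */
  __m128d c = _mm_add_round_sd (a, b, _MM_FROUND_TO_NEAREST_INT
				      | _MM_FROUND_NO_EXC);
  printf ("%f\n", _mm_cvtsd_f64 (c));	/* prints 3.750000 */
  return 0;
}

On GCC 14 and later, -mno-evex512 restricts EVEX encoding to 256-bit vectors; gating the 512-bit intrinsics behind the evex512 target, as this series does, is what makes that split workable while leaving scalar/128-bit intrinsics like the one above unaffected.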