Diffstat (limited to 'clang/lib/Headers')
33 files changed, 1893 insertions, 2073 deletions
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h index 5cfa3d0..fcc2075 100644 --- a/clang/lib/Headers/arm_acle.h +++ b/clang/lib/Headers/arm_acle.h @@ -55,11 +55,37 @@ __chkfeat(uint64_t __features) { /* 7.5 Swap */ static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __swp(uint32_t __x, volatile uint32_t *__p) { - uint32_t v; - do - v = __builtin_arm_ldrex(__p); - while (__builtin_arm_strex(__x, __p)); - return v; + uint32_t __v; +#if (__ARM_FEATURE_LDREX & 4) || __ARM_ARCH_6M__ || __linux__ + /* + * Using this clang builtin is sensible in most situations. Where + * LDREX and STREX are available, it will compile to a loop using + * them. Otherwise it will compile to a libcall, requiring the + * runtime to provide that library function. + * + * That's unavoidable on Armv6-M, which has no atomic instructions + * at all (not even SWP), so in that situation the user will just + * have to provide an implementation of __atomic_exchange_4 (perhaps + * it would temporarily disable interrupts, and then do a separate + * load and store). + * + * We also use the libcall strategy on pre-Armv7 Linux targets, on + * the theory that Linux's runtime support library _will_ provide a + * suitable libcall, and it's better to use that than the SWP + * instruction because then when the same binary is run on a later + * Linux system the libcall implementation will use LDREX instead. + */ + __v = __atomic_exchange_n(__p, __x, __ATOMIC_RELAXED); +#else + /* + * But for older Arm architectures when the target is not Linux, we + * fall back to using the SWP instruction via inline assembler. ACLE + * is clear that we're allowed to do this, but shouldn't do it if we + * have a better alternative. + */ + __asm__("swp %0, %1, [%2]" : "=r"(__v) : "r"(__x), "r"(__p) : "memory"); +#endif + return __v; } /* 7.6 Memory prefetch intrinsics */ diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h index 75290d2..95e9bd7a 100644 --- a/clang/lib/Headers/avx10_2_512bf16intrin.h +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -441,8 +441,8 @@ _mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) { static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { - return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B, - (__v32bf)__C); + return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, (__v32bf)__B, + (__v32bf)__C); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 @@ -469,8 +469,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh( static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { - return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B, - -(__v32bf)__C); + return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, (__v32bf)__B, + -(__v32bf)__C); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 @@ -497,8 +497,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh( static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { - return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B, - (__v32bf)__C); + return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, -(__v32bf)__B, + (__v32bf)__C); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh( @@ -527,8 +527,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh( static __inline__ __m512bh 
__DEFAULT_FN_ATTRS512 _mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { - return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B, - -(__v32bf)__C); + return (__m512bh)__builtin_elementwise_fma((__v32bf)__A, -(__v32bf)__B, + -(__v32bf)__C); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh( diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 66797ae..0c7f381 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -852,8 +852,8 @@ _mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) { static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_fmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { - return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B, - (__v16bf)__C); + return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, (__v16bf)__B, + (__v16bf)__C); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 @@ -880,8 +880,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pbh( static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_fmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { - return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B, - -(__v16bf)__C); + return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, (__v16bf)__B, + -(__v16bf)__C); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 @@ -908,8 +908,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pbh( static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_fnmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { - return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B, - (__v16bf)__C); + return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, -(__v16bf)__B, + (__v16bf)__C); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_pbh( @@ -938,8 +938,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pbh( static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_fnmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { - return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B, - -(__v16bf)__C); + return (__m256bh)__builtin_elementwise_fma((__v16bf)__A, -(__v16bf)__B, + -(__v16bf)__C); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_pbh( @@ -969,8 +969,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pbh( static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { - return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B, - (__v8bf)__C); + return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, (__v8bf)__B, + (__v8bf)__C); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 @@ -997,8 +997,8 @@ _mm_maskz_fmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { - return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B, - -(__v8bf)__C); + return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, (__v8bf)__B, + -(__v8bf)__C); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 @@ -1025,8 +1025,8 @@ _mm_maskz_fmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { - return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B, - (__v8bf)__C); + return 
(__m128bh)__builtin_elementwise_fma((__v8bf)__A, -(__v8bf)__B, + (__v8bf)__C); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 @@ -1053,8 +1053,8 @@ _mm_maskz_fnmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { - return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B, - -(__v8bf)__C); + return (__m128bh)__builtin_elementwise_fma((__v8bf)__A, -(__v8bf)__B, + -(__v8bf)__C); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index dc9fc07..384faa3 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -31,6 +31,14 @@ __min_vector_width__(128))) #endif +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + /* SSE4 Multiple Packed Sums of Absolute Difference. */ /// Computes sixteen sum of absolute difference (SAD) operations on sets of /// four unsigned 8-bit integers from the 256-bit integer vectors \a X and @@ -104,10 +112,9 @@ /// \param __a /// A 256-bit integer vector. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_abs_epi8(__m256i __a) -{ - return (__m256i)__builtin_elementwise_abs((__v32qs)__a); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_abs_epi8(__m256i __a) { + return (__m256i)__builtin_elementwise_abs((__v32qs)__a); } /// Computes the absolute value of each signed 16-bit element in the 256-bit @@ -121,10 +128,9 @@ _mm256_abs_epi8(__m256i __a) /// \param __a /// A 256-bit vector of [16 x i16]. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_abs_epi16(__m256i __a) -{ - return (__m256i)__builtin_elementwise_abs((__v16hi)__a); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_abs_epi16(__m256i __a) { + return (__m256i)__builtin_elementwise_abs((__v16hi)__a); } /// Computes the absolute value of each signed 32-bit element in the 256-bit @@ -138,10 +144,9 @@ _mm256_abs_epi16(__m256i __a) /// \param __a /// A 256-bit vector of [8 x i32]. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_abs_epi32(__m256i __a) -{ - return (__m256i)__builtin_elementwise_abs((__v8si)__a); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_abs_epi32(__m256i __a) { + return (__m256i)__builtin_elementwise_abs((__v8si)__a); } /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit @@ -359,9 +364,8 @@ _mm256_add_epi64(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector containing one of the source operands. /// \returns A 256-bit integer vector containing the sums. 
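Two reviewer notes on the hunks above. The arm_acle.h change at the top of this diff leaves Armv6-M users to provide __atomic_exchange_4 themselves, and its comment sketches the usual approach: mask interrupts, then do a separate load and store. A minimal sketch of such a libcall, assuming a single-core Cortex-M0-class target where PRIMASK masking suffices (illustrative only, not part of the patch):

    #include <stdint.h>

    uint32_t __atomic_exchange_4(volatile void *__p, uint32_t __val, int __order) {
      (void)__order;                              /* single core: ordering collapses */
      uint32_t __primask, __old;
      __asm__ volatile("mrs %0, PRIMASK" : "=r"(__primask));
      __asm__ volatile("cpsid i" ::: "memory");   /* mask interrupts */
      __old = *(volatile uint32_t *)__p;
      *(volatile uint32_t *)__p = __val;
      /* restore the previous mask state rather than unconditionally enabling */
      __asm__ volatile("msr PRIMASK, %0" : : "r"(__primask) : "memory");
      return __old;
    }

The avx10_2 bf16 hunks, meanwhile, all lean on one identity: negating a factor negates the product, so a single generic FMA builtin expresses fmadd, fmsub, fnmadd and fnmsub by negating operands, and the negations are exact sign flips so the result stays fused and correctly rounded. The same pattern on a plain float vector (hypothetical names):

    typedef float __v4f __attribute__((ext_vector_type(4)));

    static inline __v4f fmsub4(__v4f a, __v4f b, __v4f c) {
      return __builtin_elementwise_fma(a, b, -c);   /* a*b - c    */
    }
    static inline __v4f fnmadd4(__v4f a, __v4f b, __v4f c) {
      return __builtin_elementwise_fma(a, -b, c);   /* -(a*b) + c */
    }
    static inline __v4f fnmsub4(__v4f a, __v4f b, __v4f c) {
      return __builtin_elementwise_fma(a, -b, -c);  /* -(a*b) - c */
    }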
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_adds_epi8(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_adds_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b); } @@ -377,9 +381,8 @@ _mm256_adds_epi8(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_adds_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_adds_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b); } @@ -396,9 +399,8 @@ _mm256_adds_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector containing one of the source operands. /// \returns A 256-bit integer vector containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_adds_epu8(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_adds_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b); } @@ -414,9 +416,8 @@ _mm256_adds_epu8(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_adds_epu16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_adds_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b); } @@ -460,7 +461,7 @@ _mm256_adds_epu16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_and_si256(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a & (__v4du)__b); @@ -478,7 +479,7 @@ _mm256_and_si256(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_andnot_si256(__m256i __a, __m256i __b) { return (__m256i)(~(__v4du)__a & (__v4du)__b); @@ -633,7 +634,7 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) /// \param __b /// A 256-bit integer vector containing one of the inputs. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi8(__m256i __a, __m256i __b) { return (__m256i)((__v32qi)__a == (__v32qi)__b); @@ -659,7 +660,7 @@ _mm256_cmpeq_epi8(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the inputs. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hi)__a == (__v16hi)__b); @@ -685,7 +686,7 @@ _mm256_cmpeq_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the inputs. /// \returns A 256-bit vector of [8 x i32] containing the result. 
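Usage note on the new *_CONSTEXPR attribute variants: when the header is compiled as C++11 or later, the annotated intrinsics become usable in constant expressions. A hedged sketch (C++, illustrative lane values; assumes clang's vector subscripting in constant expressions and a TU built with -mavx2):

    #include <immintrin.h>

    constexpr __m256i a = {0x0F, 0xF0, 0xFF, 0x00};  // four i64 lanes
    constexpr __m256i b = {0x0A, 0xA0, 0x0A, 0x0A};
    constexpr __m256i r = _mm256_and_si256(a, b);    // folds at compile time
    static_assert(r[0] == 0x0A && r[1] == 0xA0 && r[2] == 0x0A && r[3] == 0, "");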
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8si)__a == (__v8si)__b); @@ -711,7 +712,7 @@ _mm256_cmpeq_epi32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [4 x i64] containing one of the inputs. /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpeq_epi64(__m256i __a, __m256i __b) { return (__m256i)((__v4di)__a == (__v4di)__b); @@ -737,7 +738,7 @@ _mm256_cmpeq_epi64(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector containing one of the inputs. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi8(__m256i __a, __m256i __b) { /* This function always performs a signed comparison, but __v32qi is a char @@ -765,7 +766,7 @@ _mm256_cmpgt_epi8(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the inputs. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hi)__a > (__v16hi)__b); @@ -791,7 +792,7 @@ _mm256_cmpgt_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the inputs. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8si)__a > (__v8si)__b); @@ -817,7 +818,7 @@ _mm256_cmpgt_epi32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [4 x i64] containing one of the inputs. /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_cmpgt_epi64(__m256i __a, __m256i __b) { return (__m256i)((__v4di)__a > (__v4di)__b); @@ -1363,9 +1364,8 @@ _mm256_movemask_epi8(__m256i __a) /// A 128-bit integer vector containing the source bytes. /// \returns A 256-bit vector of [16 x i16] containing the sign-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi8_epi16(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepi8_epi16(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); @@ -1391,9 +1391,8 @@ _mm256_cvtepi8_epi16(__m128i __V) /// A 128-bit integer vector containing the source bytes. /// \returns A 256-bit vector of [8 x i32] containing the sign-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi8_epi32(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepi8_epi32(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. 
*/ return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); @@ -1418,9 +1417,8 @@ _mm256_cvtepi8_epi32(__m128i __V) /// A 128-bit integer vector containing the source bytes. /// \returns A 256-bit vector of [4 x i64] containing the sign-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi8_epi64(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepi8_epi64(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); @@ -1446,9 +1444,8 @@ _mm256_cvtepi8_epi64(__m128i __V) /// A 128-bit vector of [8 x i16] containing the source values. /// \returns A 256-bit vector of [8 x i32] containing the sign-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi16_epi32(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepi16_epi32(__m128i __V) { return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); } @@ -1471,9 +1468,8 @@ _mm256_cvtepi16_epi32(__m128i __V) /// A 128-bit vector of [8 x i16] containing the source values. /// \returns A 256-bit vector of [4 x i64] containing the sign-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi16_epi64(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepi16_epi64(__m128i __V) { return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); } @@ -1496,9 +1492,8 @@ _mm256_cvtepi16_epi64(__m128i __V) /// A 128-bit vector of [4 x i32] containing the source values. /// \returns A 256-bit vector of [4 x i64] containing the sign-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi32_epi64(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepi32_epi64(__m128i __V) { return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); } @@ -1522,9 +1517,8 @@ _mm256_cvtepi32_epi64(__m128i __V) /// A 128-bit integer vector containing the source bytes. /// \returns A 256-bit vector of [16 x i16] containing the zero-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu8_epi16(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepu8_epi16(__m128i __V) { return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); } @@ -1548,9 +1542,8 @@ _mm256_cvtepu8_epi16(__m128i __V) /// A 128-bit integer vector containing the source bytes. /// \returns A 256-bit vector of [8 x i32] containing the zero-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu8_epi32(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepu8_epi32(__m128i __V) { return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); } @@ -1573,9 +1566,8 @@ _mm256_cvtepu8_epi32(__m128i __V) /// A 128-bit integer vector containing the source bytes. /// \returns A 256-bit vector of [4 x i64] containing the zero-extended /// values. 
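The cvtepi*/cvtepu* family above composes two generic builtins: a shuffle keeps just the lanes being widened, then __builtin_convertvector performs the per-lane extension, with sign versus zero extension chosen by the signedness of the source lane type (hence the __v16qs casts, since plain char may be unsigned). A standalone sketch of the same pattern (illustrative names, clang vector extensions):

    typedef signed char __v8qs_demo __attribute__((ext_vector_type(8)));
    typedef int __v4si_demo __attribute__((ext_vector_type(4)));

    static inline __v4si_demo sext_low4(__v8qs_demo v) {
      /* keep lanes 0..3, then widen i8 -> i32; the source lanes are signed,
         so __builtin_convertvector sign-extends */
      return __builtin_convertvector(__builtin_shufflevector(v, v, 0, 1, 2, 3),
                                     __v4si_demo);
    }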
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu8_epi64(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepu8_epi64(__m128i __V) { return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); } @@ -1599,9 +1591,8 @@ _mm256_cvtepu8_epi64(__m128i __V) /// A 128-bit vector of [8 x i16] containing the source values. /// \returns A 256-bit vector of [8 x i32] containing the zero-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu16_epi32(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepu16_epi32(__m128i __V) { return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); } @@ -1624,9 +1615,8 @@ _mm256_cvtepu16_epi32(__m128i __V) /// A 128-bit vector of [8 x i16] containing the source values. /// \returns A 256-bit vector of [4 x i64] containing the zero-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu16_epi64(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepu16_epi64(__m128i __V) { return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); } @@ -1649,9 +1639,8 @@ _mm256_cvtepu16_epi64(__m128i __V) /// A 128-bit vector of [4 x i32] containing the source values. /// \returns A 256-bit vector of [4 x i64] containing the zero-extended /// values. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu32_epi64(__m128i __V) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtepu32_epi64(__m128i __V) { return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); } @@ -1675,9 +1664,8 @@ _mm256_cvtepu32_epi64(__m128i __V) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [4 x i64] containing the products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mul_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mul_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); } @@ -1721,10 +1709,10 @@ _mm256_mulhrs_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epu16(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); + return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b); } /// Multiplies signed 16-bit integer elements of two 256-bit vectors of @@ -1740,7 +1728,7 @@ _mm256_mulhi_epu16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); @@ -1759,7 +1747,7 @@ _mm256_mulhi_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the products. 
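Note the operand casts in _mm256_mulhi_epu16 changing from __v16hi to __v16hu: the high half of the product is only well defined once the builtin knows whether the 16-bit lanes multiply as signed or unsigned. Per lane, the unsigned form computes the following (scalar sketch, illustrative):

    static inline unsigned short mulhi_u16(unsigned short a, unsigned short b) {
      /* widen to 32 bits unsigned, multiply, keep the high 16 bits */
      return (unsigned short)(((unsigned)a * (unsigned)b) >> 16);
    }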
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hu)__a * (__v16hu)__b); @@ -1804,9 +1792,8 @@ _mm256_mullo_epi32 (__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [4 x i64] containing the products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mul_epu32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mul_epu32(__m256i __a, __m256i __b) { return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); } @@ -1822,7 +1809,7 @@ _mm256_mul_epu32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_or_si256(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a | (__v4du)__b); @@ -2134,9 +2121,8 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) /// \param __count /// An unsigned integer value specifying the shift count (in bits). /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_slli_epi16(__m256i __a, int __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_slli_epi16(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); } @@ -2174,9 +2160,8 @@ _mm256_sll_epi16(__m256i __a, __m128i __count) /// \param __count /// An unsigned integer value specifying the shift count (in bits). /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_slli_epi32(__m256i __a, int __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_slli_epi32(__m256i __a, int __count) { return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); } @@ -2214,9 +2199,8 @@ _mm256_sll_epi32(__m256i __a, __m128i __count) /// \param __count /// An unsigned integer value specifying the shift count (in bits). /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_slli_epi64(__m256i __a, int __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_slli_epi64(__m256i __a, int __count) { return __builtin_ia32_psllqi256((__v4di)__a, __count); } @@ -2255,9 +2239,8 @@ _mm256_sll_epi64(__m256i __a, __m128i __count) /// \param __count /// An unsigned integer value specifying the shift count (in bits). /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srai_epi16(__m256i __a, int __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srai_epi16(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); } @@ -2297,9 +2280,8 @@ _mm256_sra_epi16(__m256i __a, __m128i __count) /// \param __count /// An unsigned integer value specifying the shift count (in bits). /// \returns A 256-bit vector of [8 x i32] containing the result. 
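_mm256_mul_epu32 above keeps its target-specific builtin but now carries the constexpr variant; per 64-bit lane it forms the full product of the low 32 bits of each source (scalar sketch, illustrative):

    static inline unsigned long long mul_epu32_lane(unsigned long long a,
                                                    unsigned long long b) {
      /* low 32 bits of each lane, multiplied out to a full 64-bit product */
      return (unsigned long long)(unsigned)a * (unsigned)b;
    }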
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srai_epi32(__m256i __a, int __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srai_epi32(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); } @@ -2378,9 +2360,8 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) /// \param __count /// An unsigned integer value specifying the shift count (in bits). /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srli_epi16(__m256i __a, int __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srli_epi16(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); } @@ -2418,9 +2399,8 @@ _mm256_srl_epi16(__m256i __a, __m128i __count) /// \param __count /// An unsigned integer value specifying the shift count (in bits). /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srli_epi32(__m256i __a, int __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srli_epi32(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); } @@ -2458,9 +2438,8 @@ _mm256_srl_epi32(__m256i __a, __m128i __count) /// \param __count /// An unsigned integer value specifying the shift count (in bits). /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srli_epi64(__m256i __a, int __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srli_epi64(__m256i __a, int __count) { return __builtin_ia32_psrlqi256((__v4di)__a, __count); } @@ -2611,9 +2590,8 @@ _mm256_sub_epi64(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector containing the subtrahends. /// \returns A 256-bit integer vector containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_subs_epi8(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_subs_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); } @@ -2637,9 +2615,8 @@ _mm256_subs_epi8(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing the subtrahends. /// \returns A 256-bit vector of [16 x i16] containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_subs_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_subs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); } @@ -2664,9 +2641,8 @@ _mm256_subs_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector containing the subtrahends. /// \returns A 256-bit integer vector containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_subs_epu8(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_subs_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); } @@ -2690,9 +2666,8 @@ _mm256_subs_epu8(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing the subtrahends. /// \returns A 256-bit vector of [16 x i16] containing the differences. 
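The adds_*/subs_* hunks here bottom out in __builtin_elementwise_add_sat and __builtin_elementwise_sub_sat, which clamp the infinite-precision result to the lane's range instead of wrapping. Per signed 8-bit lane (scalar sketch, illustrative):

    static inline signed char subs_i8(signed char a, signed char b) {
      int r = (int)a - (int)b;   /* exact result in a wider type */
      if (r > 127) r = 127;      /* clamp to the i8 range */
      if (r < -128) r = -128;
      return (signed char)r;
    }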
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_subs_epu16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_subs_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); } @@ -2724,9 +2699,8 @@ _mm256_subs_epu16(__m256i __a, __m256i __b) /// A 256-bit integer vector used as the source for the odd-numbered bytes /// of the result. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpackhi_epi8(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_unpackhi_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); } @@ -2759,9 +2733,8 @@ _mm256_unpackhi_epi8(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered /// elements of the result. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpackhi_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_unpackhi_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); } @@ -2793,9 +2766,8 @@ _mm256_unpackhi_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered /// elements of the result. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpackhi_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_unpackhi_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); } @@ -2823,9 +2795,8 @@ _mm256_unpackhi_epi32(__m256i __a, __m256i __b) /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered /// elements of the result. /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpackhi_epi64(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_unpackhi_epi64(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); } @@ -2857,9 +2828,8 @@ _mm256_unpackhi_epi64(__m256i __a, __m256i __b) /// A 256-bit integer vector used as the source for the odd-numbered bytes /// of the result. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpacklo_epi8(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_unpacklo_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); } @@ -2892,9 +2862,8 @@ _mm256_unpacklo_epi8(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] used as the source for the odd-numbered /// elements of the result. /// \returns A 256-bit vector of [16 x i16] containing the result. 
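The unpack intrinsics read directly as shuffle index lists: indices below the lane count select from __a, indices offset by the lane count select from __b, and each 128-bit half interleaves on its own. A standalone sketch of the [8 x i32] high-half case (illustrative names):

    typedef int __v8si_demo __attribute__((ext_vector_type(8)));

    static inline __v8si_demo unpackhi32(__v8si_demo a, __v8si_demo b) {
      /* upper two elements of each 128-bit half of a and b, interleaved;
         indices >= 8 select from b */
      return __builtin_shufflevector(a, b, 2, 8 + 2, 3, 8 + 3, 6, 8 + 6, 7, 8 + 7);
    }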
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpacklo_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_unpacklo_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); } @@ -2926,9 +2895,8 @@ _mm256_unpacklo_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [8 x i32] used as the source for the odd-numbered /// elements of the result. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpacklo_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_unpacklo_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); } @@ -2956,9 +2924,8 @@ _mm256_unpacklo_epi32(__m256i __a, __m256i __b) /// A 256-bit vector of [4 x i64] used as the source for the odd-numbered /// elements of the result. /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpacklo_epi64(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_unpacklo_epi64(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); } @@ -2974,7 +2941,7 @@ _mm256_unpacklo_epi64(__m256i __a, __m256i __b) /// \param __b /// A 256-bit integer vector. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_xor_si256(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a ^ (__v4du)__b); @@ -3009,9 +2976,8 @@ _mm256_stream_load_si256(const void *__V) /// \param __X /// A 128-bit vector of [4 x float] whose low element will be broadcast. /// \returns A 128-bit vector of [4 x float] containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_broadcastss_ps(__m128 __X) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_broadcastss_ps(__m128 __X) { return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); } @@ -3026,9 +2992,8 @@ _mm_broadcastss_ps(__m128 __X) /// \param __a /// A 128-bit vector of [2 x double] whose low element will be broadcast. /// \returns A 128-bit vector of [2 x double] containing the result. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_broadcastsd_pd(__m128d __a) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_broadcastsd_pd(__m128d __a) { return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); } @@ -3043,9 +3008,8 @@ _mm_broadcastsd_pd(__m128d __a) /// \param __X /// A 128-bit vector of [4 x float] whose low element will be broadcast. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_broadcastss_ps(__m128 __X) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcastss_ps(__m128 __X) { return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -3060,9 +3024,8 @@ _mm256_broadcastss_ps(__m128 __X) /// \param __X /// A 128-bit vector of [2 x double] whose low element will be broadcast. /// \returns A 256-bit vector of [4 x double] containing the result. 
-static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_broadcastsd_pd(__m128d __X) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcastsd_pd(__m128d __X) { return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); } @@ -3076,9 +3039,8 @@ _mm256_broadcastsd_pd(__m128d __X) /// \param __X /// A 128-bit integer vector to be broadcast. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastsi128_si256(__m128i __X) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcastsi128_si256(__m128i __X) { return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); } @@ -3168,9 +3130,8 @@ _mm256_broadcastsi128_si256(__m128i __X) /// \param __X /// A 128-bit integer vector whose low byte will be broadcast. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastb_epi8(__m128i __X) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcastb_epi8(__m128i __X) { return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -3184,9 +3145,8 @@ _mm256_broadcastb_epi8(__m128i __X) /// \param __X /// A 128-bit vector of [8 x i16] whose low element will be broadcast. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastw_epi16(__m128i __X) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcastw_epi16(__m128i __X) { return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -3200,9 +3160,8 @@ _mm256_broadcastw_epi16(__m128i __X) /// \param __X /// A 128-bit vector of [4 x i32] whose low element will be broadcast. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastd_epi32(__m128i __X) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcastd_epi32(__m128i __X) { return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -3216,9 +3175,8 @@ _mm256_broadcastd_epi32(__m128i __X) /// \param __X /// A 128-bit vector of [2 x i64] whose low element will be broadcast. /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastq_epi64(__m128i __X) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcastq_epi64(__m128i __X) { return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); } @@ -3232,9 +3190,8 @@ _mm256_broadcastq_epi64(__m128i __X) /// \param __X /// A 128-bit integer vector whose low byte will be broadcast. /// \returns A 128-bit integer vector containing the result. -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastb_epi8(__m128i __X) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_broadcastb_epi8(__m128i __X) { return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -3248,9 +3205,8 @@ _mm_broadcastb_epi8(__m128i __X) /// \param __X /// A 128-bit vector of [8 x i16] whose low element will be broadcast. /// \returns A 128-bit vector of [8 x i16] containing the result. 
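The broadcast conversions in this stretch use the simplest shuffle of all: every result index names source lane 0. A sketch on a float vector (illustrative names):

    typedef float __v4sf_demo __attribute__((ext_vector_type(4)));
    typedef float __v8sf_demo __attribute__((ext_vector_type(8)));

    static inline __v8sf_demo splat_lane0(__v4sf_demo x) {
      /* replicate lane 0 across all eight result lanes */
      return __builtin_shufflevector(x, x, 0, 0, 0, 0, 0, 0, 0, 0);
    }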
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastw_epi16(__m128i __X) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_broadcastw_epi16(__m128i __X) { return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -3264,9 +3220,8 @@ _mm_broadcastw_epi16(__m128i __X) /// \param __X /// A 128-bit vector of [4 x i32] whose low element will be broadcast. /// \returns A 128-bit vector of [4 x i32] containing the result. -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastd_epi32(__m128i __X) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_broadcastd_epi32(__m128i __X) { return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); } @@ -3280,9 +3235,8 @@ _mm_broadcastd_epi32(__m128i __X) /// \param __X /// A 128-bit vector of [2 x i64] whose low element will be broadcast. /// \returns A 128-bit vector of [2 x i64] containing the result. -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastq_epi64(__m128i __X) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_broadcastq_epi64(__m128i __X) { return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); } @@ -3756,7 +3710,7 @@ _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in /// bits). /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); @@ -3778,7 +3732,7 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y) /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in /// bits). /// \returns A 128-bit vector of [4 x i32] containing the result. -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); @@ -3800,7 +3754,7 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y) /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in /// bits). /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_sllv_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); @@ -3822,7 +3776,7 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y) /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in /// bits). /// \returns A 128-bit vector of [2 x i64] containing the result. -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sllv_epi64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); @@ -3845,7 +3799,7 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y) /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in /// bits). /// \returns A 256-bit vector of [8 x i32] containing the result. 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srav_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); @@ -3868,7 +3822,7 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y) /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in /// bits). /// \returns A 128-bit vector of [4 x i32] containing the result. -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srav_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); @@ -3890,7 +3844,7 @@ _mm_srav_epi32(__m128i __X, __m128i __Y) /// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in /// bits). /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); @@ -3912,7 +3866,7 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y) /// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in /// bits). /// \returns A 128-bit vector of [4 x i32] containing the result. -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); @@ -3934,7 +3888,7 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y) /// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in /// bits). /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_srlv_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); @@ -3956,7 +3910,7 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y) /// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in /// bits). /// \returns A 128-bit vector of [2 x i64] containing the result. 
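The sllv/srav/srlv group also gains the constexpr variant. Their per-lane semantics differ from scalar C shifts: counts of the lane width or more are well defined, with the logical forms producing zero and the arithmetic form saturating the count (scalar sketch, assuming an arithmetic right shift on signed int; illustrative):

    static inline unsigned srlv_lane_u32(unsigned x, unsigned n) {
      return n < 32 ? x >> n : 0;      /* psrlv: oversize count yields 0 */
    }
    static inline int srav_lane_i32(int x, unsigned n) {
      return x >> (n < 32 ? n : 31);   /* psrav: count saturates to 31 */
    }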
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_srlv_epi64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); @@ -5289,5 +5243,7 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) #undef __DEFAULT_FN_ATTRS256 #undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR #endif /* __AVX2INTRIN_H */ diff --git a/clang/lib/Headers/avx512bitalgintrin.h b/clang/lib/Headers/avx512bitalgintrin.h index 3c446b3..5cc3207 100644 --- a/clang/lib/Headers/avx512bitalgintrin.h +++ b/clang/lib/Headers/avx512bitalgintrin.h @@ -20,48 +20,42 @@ __target__("avx512bitalg,evex512"), \ __min_vector_width__(512))) -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_popcnt_epi16(__m512i __A) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_popcnt_epi16(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v32hu)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) -{ - return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U, - (__v32hi) _mm512_popcnt_epi16(__B), - (__v32hi) __A); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512( + (__mmask32)__U, (__v32hi)_mm512_popcnt_epi16(__B), (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) -{ - return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(), - __U, - __B); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) { + return _mm512_mask_popcnt_epi16((__m512i)_mm512_setzero_si512(), __U, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_popcnt_epi8(__m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_popcnt_epi8(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v64qu)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) -{ - return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U, - (__v64qi) _mm512_popcnt_epi8(__B), - (__v64qi) __A); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_popcnt_epi8(__B), (__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) -{ - return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(), - __U, - __B); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) { + return _mm512_mask_popcnt_epi8((__m512i)_mm512_setzero_si512(), __U, __B); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS @@ -80,7 +74,7 @@ _mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) __B); } - #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index c854720..eabe215 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -25,6 +25,14 @@ typedef unsigned 
long long __mmask64; __attribute__((__always_inline__, __nodebug__, \ __target__("avx512bw,no-evex512"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + static __inline __mmask32 __DEFAULT_FN_ATTRS _knot_mask32(__mmask32 __M) { @@ -438,7 +446,7 @@ _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mullo_epi16 (__m512i __A, __m512i __B) { return (__m512i) ((__v32hu) __A * (__v32hu) __B); } @@ -473,45 +481,39 @@ _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) (__v32hi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_abs_epi8 (__m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_abs_epi8(__m512i __A) { return (__m512i)__builtin_elementwise_abs((__v64qs)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_abs_epi8(__m512i __W, __mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_abs_epi8(__A), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_abs_epi8(__mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_abs_epi8(__A), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_abs_epi16 (__m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_abs_epi16(__m512i __A) { return (__m512i)__builtin_elementwise_abs((__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_abs_epi16(__m512i __W, __mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_abs_epi16(__A), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_abs_epi16(__A), (__v32hi)_mm512_setzero_si512()); @@ -605,9 +607,8 @@ _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_adds_epi8 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_adds_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B); } @@ -627,9 +628,8 @@ _mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_adds_epi16 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_adds_epi16(__m512i __A, __m512i __B) { return 
(__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B); } @@ -649,9 +649,8 @@ _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_adds_epu8 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_adds_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B); } @@ -671,9 +670,8 @@ _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_adds_epu16 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_adds_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B); } @@ -938,9 +936,8 @@ _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_subs_epi8 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_subs_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B); } @@ -960,9 +957,8 @@ _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_subs_epi16 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_subs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B); } @@ -982,9 +978,8 @@ _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_subs_epu8 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_subs_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B); } @@ -1004,9 +999,8 @@ _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_subs_epu16 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_subs_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B); } @@ -1082,49 +1076,40 @@ _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mulhi_epi16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B); +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mulhi_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhw512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epi16(__A, __B), - (__v32hi)__W); +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512( + (__mmask32)__U, (__v32hi)_mm512_mulhi_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 
-_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512( + (__mmask32)__U, (__v32hi)_mm512_mulhi_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mulhi_epu16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B); +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mulhi_epu16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhuw512((__v32hu)__A, (__v32hu)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epu16(__A, __B), - (__v32hi)__W); +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512( + (__mmask32)__U, (__v32hi)_mm512_mulhi_epu16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512( + (__mmask32)__U, (__v32hi)_mm512_mulhi_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1247,7 +1232,7 @@ _mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B, 8, 64+8, 9, 64+9, @@ -1282,7 +1267,7 @@ _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B, 4, 32+4, 5, 32+5, @@ -1309,7 +1294,7 @@ _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B, 0, 64+0, 1, 64+1, @@ -1344,7 +1329,7 @@ _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B, 0, 32+0, 1, 32+1, @@ -1371,9 +1356,8 @@ _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { 
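/*
 * Illustrative sketch, not part of the change: once an intrinsic body is
 * pure vector arithmetic, a __builtin_shufflevector, or a constexpr-capable
 * builtin such as __builtin_elementwise_add_sat, the new
 * __DEFAULT_FN_ATTRS512_CONSTEXPR spelling lets it fold in constant
 * expressions from C++11 onwards (assuming a translation unit built with
 * AVX-512BW enabled; the variable name is hypothetical):
 */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
/* Every 8-bit lane saturates to 127 instead of wrapping to -56. */
constexpr __m512i __adds_demo =
    _mm512_adds_epi8(_mm512_set1_epi8(100), _mm512_set1_epi8(100));
#endif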
(__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi8_epi16(__m256i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi8_epi16(__m256i __A) { /* This function always performs a signed extension, but __v32qi is a char which may be signed or unsigned, so use __v32qs. */ return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi); @@ -1395,9 +1379,8 @@ _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu8_epi16(__m256i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepu8_epi16(__m256i __A) { return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi); } @@ -1494,24 +1477,21 @@ _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_slli_epi16(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_slli_epi16(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_slli_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_slli_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -1586,24 +1566,21 @@ _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srai_epi16(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srai_epi16(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srai_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srai_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -1631,24 +1608,21 @@ _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srli_epi16(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srli_epi16(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR 
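/*
 * Illustrative sketch, not part of the change: the __v32qs comment above is
 * the whole trick in _mm512_cvtepi8_epi16. Plain char may be unsigned, so
 * each lane is first reinterpreted as signed char to force sign extension.
 * A scalar model of one lane (helper name hypothetical):
 */
static __inline__ short __sext8to16_lane(char __c) {
  /* 0xFF must widen to -1, not to 255, hence the detour via signed char. */
  return (short)(signed char)__c;
}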
_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srli_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srli_epi16(__A, (unsigned int)__B), (__v32hi)_mm512_setzero_si512()); @@ -1883,9 +1857,8 @@ _mm512_movm_epi16 (__mmask32 __A) return (__m512i) __builtin_ia32_cvtmask2w512 (__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcastb_epi8 (__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcastb_epi8(__m128i __A) { return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1925,9 +1898,8 @@ _mm512_maskz_set1_epi16 (__mmask32 __M, short __A) (__v32hi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcastw_epi16 (__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcastw_epi16(__m128i __A) { return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); @@ -2010,5 +1982,7 @@ _mm512_sad_epu8 (__m512i __A, __m512i __B) #undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/avx512cdintrin.h b/clang/lib/Headers/avx512cdintrin.h index 33b552f..39e7671 100644 --- a/clang/lib/Headers/avx512cdintrin.h +++ b/clang/lib/Headers/avx512cdintrin.h @@ -19,6 +19,12 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512cd,evex512"), __min_vector_width__(512))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_conflict_epi64 (__m512i __A) { @@ -63,45 +69,41 @@ _mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_lzcnt_epi32 (__m512i __A) -{ - return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_lzcnt_epi32(__m512i __A) { + return (__m512i)__builtin_elementwise_ctlz((__v16si)__A, + (__v16si)_mm512_set1_epi32(32)); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_lzcnt_epi32(__A), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_lzcnt_epi32(__A), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_lzcnt_epi64 
(__m512i __A) -{ - return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_lzcnt_epi64(__m512i __A) { + return (__m512i)__builtin_elementwise_ctlz( + (__v8di)__A, (__v8di)_mm512_set1_epi64((long long)64)); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_lzcnt_epi64(__A), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_lzcnt_epi64(__A), (__v8di)_mm512_setzero_si512()); @@ -121,5 +123,6 @@ _mm512_broadcastmw_epi32 (__mmask16 __A) } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index 88b48e3..87d16b47 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -20,6 +20,14 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512dq,no-evex512"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + static __inline __mmask8 __DEFAULT_FN_ATTRS _knot_mask8(__mmask8 __M) { @@ -167,7 +175,7 @@ _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_xor_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8du)__A ^ (__v8du)__B); } @@ -186,7 +194,7 @@ _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_xor_ps (__m512 __A, __m512 __B) { return (__m512)((__v16su)__A ^ (__v16su)__B); } @@ -205,7 +213,7 @@ _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_or_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8du)__A | (__v8du)__B); } @@ -224,7 +232,7 @@ _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_or_ps(__m512 __A, __m512 __B) { return (__m512)((__v16su)__A | (__v16su)__B); } @@ -243,7 +251,7 @@ _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_and_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8du)__A & (__v8du)__B); } @@ -262,7 +270,7 @@ _mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) { (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 
__DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_and_ps(__m512 __A, __m512 __B) { return (__m512)((__v16su)__A & (__v16su)__B); } @@ -281,7 +289,7 @@ _mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_andnot_pd(__m512d __A, __m512d __B) { return (__m512d)(~(__v8du)__A & (__v8du)__B); } @@ -300,7 +308,7 @@ _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_andnot_ps(__m512 __A, __m512 __B) { return (__m512)(~(__v16su)__A & (__v16su)__B); } @@ -1076,10 +1084,8 @@ _mm512_movepi64_mask (__m512i __A) return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); } - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f32x2 (__m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_f32x2(__m128 __A) { return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); @@ -1101,9 +1107,8 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f32x8(__m256 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_f32x8(__m256 __A) { return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7); @@ -1125,9 +1130,8 @@ _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f64x2(__m128d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_f64x2(__m128d __A) { return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, 0, 1, 0, 1, 0, 1, 0, 1); } @@ -1148,9 +1152,8 @@ _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i32x2 (__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_i32x2(__m128i __A) { return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); @@ -1172,9 +1175,8 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i32x8(__m256i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_i32x8(__m256i __A) { return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7); @@ -1196,9 +1198,8 @@ _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i64x2(__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_i64x2(__m128i __A) { return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, 0, 1, 0, 1, 0, 1, 0, 1); } @@ -1375,5 +1376,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) #undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 45e7eeb..0006e33 100644 --- 
a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -180,9 +180,9 @@ typedef enum #define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr #else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS #define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 #endif /* Create vectors with repeated elements */ @@ -218,9 +218,8 @@ _mm512_undefined_epi32(void) return (__m512i)__builtin_ia32_undef512(); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcastd_epi32 (__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcastd_epi32(__m128i __A) { return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -241,9 +240,8 @@ _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) (__v16si) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcastq_epi64 (__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcastq_epi64(__m128i __A) { return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -277,20 +275,20 @@ _mm512_setzero_pd(void) { return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; } -static __inline __m512 __DEFAULT_FN_ATTRS512 +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_ps(float __w) { return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, __w }; } -static __inline __m512d __DEFAULT_FN_ATTRS512 +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_pd(double __w) { return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_epi8(char __w) { return __extension__ (__m512i)(__v64qi){ @@ -304,7 +302,7 @@ _mm512_set1_epi8(char __w) __w, __w, __w, __w, __w, __w, __w, __w }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_epi16(short __w) { return __extension__ (__m512i)(__v32hi){ @@ -314,7 +312,7 @@ _mm512_set1_epi16(short __w) __w, __w, __w, __w, __w, __w, __w, __w }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_epi32(int __s) { return __extension__ (__m512i)(__v16si){ @@ -330,7 +328,7 @@ _mm512_maskz_set1_epi32(__mmask16 __M, int __A) (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set1_epi64(long long __d) { return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; @@ -344,39 +342,33 @@ _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_broadcastss_ps(__m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcastss_ps(__m128 __A) { return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set4_epi32 (int __A, int __B, int __C, int __D) -{ +static __inline 
__m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_set4_epi32(int __A, int __B, int __C, int __D) { return __extension__ (__m512i)(__v16si) { __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set4_epi64 (long long __A, long long __B, long long __C, - long long __D) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_set4_epi64(long long __A, long long __B, long long __C, long long __D) { return __extension__ (__m512i) (__v8di) { __D, __C, __B, __A, __D, __C, __B, __A }; } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_set4_pd (double __A, double __B, double __C, double __D) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_set4_pd(double __A, double __B, double __C, double __D) { return __extension__ (__m512d) { __D, __C, __B, __A, __D, __C, __B, __A }; } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_set4_ps (float __A, float __B, float __C, float __D) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_set4_ps(float __A, float __B, float __C, float __D) { return __extension__ (__m512) { __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A }; @@ -394,9 +386,8 @@ _mm512_set4_ps (float __A, float __B, float __C, float __D) #define _mm512_setr4_ps(e0,e1,e2,e3) \ _mm512_set4_ps((e3),(e2),(e1),(e0)) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_broadcastsd_pd(__m128d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcastsd_pd(__m128d __A) { return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -435,9 +426,8 @@ _mm512_castps512_ps128(__m512 __a) return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); } -static __inline __m256 __DEFAULT_FN_ATTRS512 -_mm512_castps512_ps256 (__m512 __A) -{ +static __inline __m256 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_castps512_ps256(__m512 __A) { return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7); } @@ -516,9 +506,8 @@ _mm512_castsi512_si128 (__m512i __A) return (__m128i)__builtin_shufflevector(__A, __A , 0, 1); } -static __inline __m256i __DEFAULT_FN_ATTRS512 -_mm512_castsi512_si256 (__m512i __A) -{ +static __inline __m256i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_castsi512_si256(__m512i __A) { return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); } @@ -645,15 +634,14 @@ _mm512_zextsi256_si512(__m256i __a) } /* Bitwise operators */ -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_and_epi32(__m512i __a, __m512i __b) { return (__m512i)((__v16su)__a & (__v16su)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, (__v16si) _mm512_and_epi32(__a, __b), (__v16si) __src); @@ -666,18 +654,16 @@ _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_and_epi64(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a & (__v8du)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) -{ - return (__m512i) __builtin_ia32_selectq_512 
((__mmask8) __k, - (__v8di) _mm512_and_epi64(__a, __b), - (__v8di) __src); +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) { + return (__m512i)__builtin_ia32_selectq_512( + (__mmask8)__k, (__v8di)_mm512_and_epi64(__a, __b), (__v8di)__src); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -687,13 +673,13 @@ _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_andnot_si512 (__m512i __A, __m512i __B) { return (__m512i)(~(__v8du)__A & (__v8du)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_andnot_epi32 (__m512i __A, __m512i __B) { return (__m512i)(~(__v16su)__A & (__v16su)__B); @@ -714,7 +700,7 @@ _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B) __U, __A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_andnot_epi64(__m512i __A, __m512i __B) { return (__m512i)(~(__v8du)__A & (__v8du)__B); @@ -735,7 +721,7 @@ _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B) __U, __A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_or_epi32(__m512i __a, __m512i __b) { return (__m512i)((__v16su)__a | (__v16su)__b); @@ -755,7 +741,7 @@ _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_or_epi64(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a | (__v8du)__b); @@ -775,7 +761,7 @@ _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_xor_epi32(__m512i __a, __m512i __b) { return (__m512i)((__v16su)__a ^ (__v16su)__b); @@ -795,7 +781,7 @@ _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_xor_epi64(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a ^ (__v8du)__b); @@ -815,19 +801,19 @@ _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_and_si512(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a & (__v8du)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_or_si512(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a | (__v8du)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_xor_si512(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a ^ (__v8du)__b); @@ -835,45 +821,38 @@ _mm512_xor_si512(__m512i __a, __m512i __b) /* Arithmetic */ -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_add_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d
__DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_add_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a + (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_add_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_add_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a + (__v16sf)__b); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_mul_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mul_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a * (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_mul_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mul_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a * (__v16sf)__b); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_sub_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sub_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a - (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_sub_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sub_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a - (__v16sf)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_add_epi64 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_add_epi64(__m512i __A, __m512i __B) { return (__m512i) ((__v8du) __A + (__v8du) __B); } @@ -1429,9 +1408,8 @@ _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mul_epi32(__m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mul_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); } @@ -1451,9 +1429,8 @@ _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512 ()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mul_epu32(__m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mul_epu32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); } @@ -1866,45 +1843,39 @@ _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_abs_epi64(__m512i __A) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_abs_epi64(__m512i __A) { return (__m512i)__builtin_elementwise_abs((__v8di)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_abs_epi64(__m512i __W, __mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_abs_epi64(__A), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_abs_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_abs_epi64(__A), (__v8di)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_abs_epi32(__m512i __A) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_abs_epi32(__m512i __A) { return 
(__m512i)__builtin_elementwise_abs((__v16si) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_abs_epi32(__m512i __W, __mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_abs_epi32(__A), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_abs_epi32(__A), (__v16si)_mm512_setzero_si512()); @@ -2315,9 +2286,8 @@ _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_div_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d + __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_div_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a/(__v8df)__b); } @@ -2335,9 +2305,8 @@ _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_div_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_div_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a/(__v16sf)__b); } @@ -3615,115 +3584,99 @@ _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) (__v16sf)_mm512_setzero_ps(), \ (__mmask16)(U), (int)(R))) -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32_ps (__m512i __A) -{ +static __inline__ __m512 + __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_cvtepu32_ps(__m512i __A) { return (__m512)__builtin_convertvector((__v16su)__A, __v16sf); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtepu32_ps(__m512 __W, __mmask16 __U, __m512i __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_cvtepu32_ps(__A), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_cvtepu32_ps(__mmask16 __U, __m512i __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_cvtepu32_ps(__A), (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_pd(__m256i __A) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi32_pd(__m256i __A) { return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtepi32_pd(__m512d __W, __mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, (__v8df)_mm512_cvtepi32_pd(__A), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_cvtepi32_pd(__mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, (__v8df)_mm512_cvtepi32_pd(__A), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32lo_pd(__m512i __A) -{ +static
__inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi32lo_pd(__m512i __A) { return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U, __m512i __A) { return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_ps (__m512i __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi32_ps(__m512i __A) { return (__m512)__builtin_convertvector((__v16si)__A, __v16sf); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtepi32_ps(__m512 __W, __mmask16 __U, __m512i __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_cvtepi32_ps(__A), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_cvtepi32_ps(__mmask16 __U, __m512i __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_cvtepi32_ps(__A), (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32_pd(__m256i __A) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepu32_pd(__m256i __A) { return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtepu32_pd(__m512d __W, __mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, (__v8df)_mm512_cvtepu32_pd(__A), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_cvtepu32_pd(__mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, (__v8df)_mm512_cvtepu32_pd(__A), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32lo_pd(__m512i __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepu32lo_pd(__m512i __A) { return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U, __m512i __A) { return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); } @@ -4123,9 +4076,8 @@ _mm512_cvtss_f32(__m512 __a) /* Unpack and Interleave */ -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_pd(__m512d __a, __m512d __b) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpackhi_pd(__m512d __a, __m512d __b) { return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); } @@ -4146,9 +4098,8 @@ _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) (__v8df)_mm512_setzero_pd()); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_pd(__m512d __a, __m512d __b) -{ +static 
__inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpacklo_pd(__m512d __a, __m512d __b) { return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); } @@ -4169,9 +4120,8 @@ _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpackhi_ps(__m512 __a, __m512 __b) { return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 2, 18, 3, 19, 2+4, 18+4, 3+4, 19+4, @@ -4195,9 +4145,8 @@ _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) (__v16sf)_mm512_setzero_ps()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_ps(__m512 __a, __m512 __b) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpacklo_ps(__m512 __a, __m512 __b) { return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, 0, 16, 1, 17, 0+4, 16+4, 1+4, 17+4, @@ -4221,9 +4170,8 @@ _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_epi32(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpackhi_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, 2, 18, 3, 19, 2+4, 18+4, 3+4, 19+4, @@ -4247,9 +4195,8 @@ _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_epi32(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpacklo_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, 0, 16, 1, 17, 0+4, 16+4, 1+4, 17+4, @@ -4273,9 +4220,8 @@ _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_epi64(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpackhi_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); } @@ -4296,9 +4242,8 @@ _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_epi64 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_unpacklo_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); } @@ -4727,9 +4672,8 @@ _mm512_knot(__mmask16 __M) #define _mm512_mask_cmpneq_epu64_mask(k, A, B) \ _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi8_epi32(__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi8_epi32(__m128i __A) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. 
*/ return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si); @@ -4751,9 +4695,8 @@ _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi8_epi64(__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi8_epi64(__m128i __A) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); @@ -4775,9 +4718,8 @@ _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) (__v8di)_mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_epi64(__m256i __X) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi32_epi64(__m256i __X) { return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); } @@ -4797,9 +4739,8 @@ _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi16_epi32(__m256i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi16_epi32(__m256i __A) { return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); } @@ -4819,9 +4760,8 @@ _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) (__v16si)_mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi16_epi64(__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepi16_epi64(__m128i __A) { return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); } @@ -4841,9 +4781,8 @@ _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu8_epi32(__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepu8_epi32(__m128i __A) { return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); } @@ -4863,9 +4802,8 @@ _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu8_epi64(__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepu8_epi64(__m128i __A) { return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); } @@ -4885,9 +4823,8 @@ _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32_epi64(__m256i __X) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepu32_epi64(__m256i __X) { return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); } @@ -4907,9 +4844,8 @@ _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu16_epi32(__m256i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepu16_epi32(__m256i __A) { return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); } @@ -4929,9 +4865,8 @@ _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu16_epi64(__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtepu16_epi64(__m128i __A) { return 
(__m512i)__builtin_convertvector((__v8hu)__A, __v8di); } @@ -4954,7 +4889,7 @@ _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rorv_epi32 (__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B); + return (__m512i)__builtin_elementwise_fshr((__v16su)__A,(__v16su)__A, (__v16su)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -4976,7 +4911,7 @@ _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rorv_epi64 (__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B); + return (__m512i)__builtin_elementwise_fshr((__v8du)__A, (__v8du)__A, (__v8du)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5066,7 +5001,7 @@ _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rolv_epi32 (__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B); + return (__m512i)__builtin_elementwise_fshl((__v16su)__A, (__v16su)__A, (__v16su)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5088,7 +5023,7 @@ _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rolv_epi64 (__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B); + return (__m512i)__builtin_elementwise_fshl((__v8du)__A, (__v8du)__A, (__v8du)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5133,91 +5068,81 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) (__v8di)_mm512_ror_epi64((A), (B)), \ (__v8di)_mm512_setzero_si512())) -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_slli_epi32(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_slli_epi32(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_slli_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_slli_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_slli_epi64(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_slli_epi64(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_slli_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR 
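/*
 * Illustrative sketch, not part of the change: a funnel shift concatenates
 * two inputs and shifts across the seam, so passing the same vector twice,
 * as _mm512_rorv_epi32 and _mm512_rolv_epi32 above now do with
 * __builtin_elementwise_fshr and __builtin_elementwise_fshl, yields a
 * per-lane rotate. A scalar model of one 32-bit lane (helper name
 * hypothetical):
 */
static __inline__ unsigned int __rorv32_lane(unsigned int __a,
                                             unsigned int __n) {
  __n &= 31; /* the rotate count is taken modulo the lane width */
  return __n ? ((__a >> __n) | (__a << (32 - __n))) : __a;
}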
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_slli_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srli_epi32(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srli_epi32(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srli_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srli_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srli_epi64(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srli_epi64(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_srli_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, - unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_srli_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); @@ -5303,7 +5228,7 @@ _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_movedup_pd (__m512d __A) { return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A, @@ -6622,46 +6547,41 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) (__mmask8)(U), \ (int)(R))) -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srai_epi32(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srai_epi32(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srai_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, - unsigned int __B) { +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srai_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static 
__inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srai_epi64(__m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srai_epi64(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_srai_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_srai_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); @@ -6827,9 +6747,8 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f32x4(__m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_f32x4(__m128 __A) { return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3); @@ -6851,9 +6770,8 @@ _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f64x4(__m256d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_f64x4(__m256d __A) { return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A, 0, 1, 2, 3, 0, 1, 2, 3); } @@ -6874,9 +6792,8 @@ _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i32x4(__m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_i32x4(__m128i __A) { return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3); @@ -6898,9 +6815,8 @@ _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i64x4(__m256i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_broadcast_i64x4(__m256i __A) { return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A, 0, 1, 2, 3, 0, 1, 2, 3); } @@ -8665,7 +8581,7 @@ _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) _mm512_setzero_si512()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_movehdup_ps (__m512 __A) { return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, @@ -8688,7 +8604,7 @@ _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_moveldup_ps (__m512 __A) { return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, @@ -8941,70 +8857,57 @@ _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) (__v8df)_mm512_setzero_pd(), \ (__mmask8)(U), (int)(R)))
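/*
 * Illustrative sketch, not part of the change: the __builtin_ia32_select*_512
 * calls used throughout the masked forms below are plain per-lane blends
 * driven by the mask bits. A scalar model of lane __i of the pd variant
 * (helper name hypothetical):
 */
static __inline__ double __selectpd_lane(__mmask8 __u, int __i, double __a,
                                         double __w) {
  /* Lane i takes __a when mask bit i is set, otherwise the passthrough __w. */
  return ((__u >> __i) & 1) ? __a : __w;
}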
-static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtps_pd (__m256 __A) -{ +static __inline__ __m512d + __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_cvtps_pd(__m256 __A) { return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_cvtps_pd(__A), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_cvtps_pd(__A), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtpslo_pd (__m512 __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtpslo_pd(__m512 __A) { return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtpslo_pd(__m512d __W, __mmask8 __U, __m512 __A) { return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __A, - (__v8df) __W); +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)__A, + (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __A, - (__v8df) _mm512_setzero_pd ()); +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)__A, + (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __A, - (__v16sf) __W); +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)__A, + (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __A, - (__v16sf) _mm512_setzero_ps ()); +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)__A, + (__v16sf)_mm512_setzero_ps()); } static __inline__ void __DEFAULT_FN_ATTRS512 @@ -9204,18 +9107,18 @@ _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) (__v8di) __O); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, - char __e58, char __e57, char __e56, char __e55, char __e54, char __e53, - char __e52, char __e51, char __e50, char __e49, char __e48, char __e47, - char __e46, char __e45, char __e44, char
__e43, char __e42, char __e41, - char __e40, char __e39, char __e38, char __e37, char __e36, char __e35, - char __e34, char __e33, char __e32, char __e31, char __e30, char __e29, - char __e28, char __e27, char __e26, char __e25, char __e24, char __e23, - char __e22, char __e21, char __e20, char __e19, char __e18, char __e17, - char __e16, char __e15, char __e14, char __e13, char __e12, char __e11, - char __e10, char __e9, char __e8, char __e7, char __e6, char __e5, - char __e4, char __e3, char __e2, char __e1, char __e0) { +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set_epi8( + char __e63, char __e62, char __e61, char __e60, char __e59, char __e58, + char __e57, char __e56, char __e55, char __e54, char __e53, char __e52, + char __e51, char __e50, char __e49, char __e48, char __e47, char __e46, + char __e45, char __e44, char __e43, char __e42, char __e41, char __e40, + char __e39, char __e38, char __e37, char __e36, char __e35, char __e34, + char __e33, char __e32, char __e31, char __e30, char __e29, char __e28, + char __e27, char __e26, char __e25, char __e24, char __e23, char __e22, + char __e21, char __e20, char __e19, char __e18, char __e17, char __e16, + char __e15, char __e14, char __e13, char __e12, char __e11, char __e10, + char __e9, char __e8, char __e7, char __e6, char __e5, char __e4, char __e3, + char __e2, char __e1, char __e0) { return __extension__ (__m512i)(__v64qi) {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, @@ -9228,14 +9131,13 @@ _mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63}; } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, - short __e27, short __e26, short __e25, short __e24, short __e23, - short __e22, short __e21, short __e20, short __e19, short __e18, - short __e17, short __e16, short __e15, short __e14, short __e13, - short __e12, short __e11, short __e10, short __e9, short __e8, - short __e7, short __e6, short __e5, short __e4, short __e3, - short __e2, short __e1, short __e0) { +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set_epi16( + short __e31, short __e30, short __e29, short __e28, short __e27, + short __e26, short __e25, short __e24, short __e23, short __e22, + short __e21, short __e20, short __e19, short __e18, short __e17, + short __e16, short __e15, short __e14, short __e13, short __e12, + short __e11, short __e10, short __e9, short __e8, short __e7, short __e6, + short __e5, short __e4, short __e3, short __e2, short __e1, short __e0) { return __extension__ (__m512i)(__v32hi) {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, @@ -9243,12 +9145,9 @@ _mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set_epi32 (int __A, int __B, int __C, int __D, - int __E, int __F, int __G, int __H, - int __I, int __J, int __K, int __L, - int __M, int __N, int __O, int __P) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set_epi32( + int __A, int __B, int __C, int __D, int __E, int __F, int __G, int __H, + int __I, int __J, int __K, int __L, int __M, int __N, int __O, int __P) { return __extension__ (__m512i)(__v16si) { __P, __O, __N, __M, __L, __K, __J, __I, __H, __G, __F, __E, __D, __C, __B, __A }; @@ -9259,11 +9158,9 @@ _mm512_set_epi32 (int __A, int __B, int __C, int __D, 
_mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \ (e5),(e4),(e3),(e2),(e1),(e0)) -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_set_epi64 (long long __A, long long __B, long long __C, - long long __D, long long __E, long long __F, - long long __G, long long __H) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_set_epi64(long long __A, long long __B, long long __C, long long __D, + long long __E, long long __F, long long __G, long long __H) { return __extension__ (__m512i) (__v8di) { __H, __G, __F, __E, __D, __C, __B, __A }; } @@ -9271,10 +9168,9 @@ _mm512_set_epi64 (long long __A, long long __B, long long __C, #define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \ _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_set_pd (double __A, double __B, double __C, double __D, - double __E, double __F, double __G, double __H) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_set_pd(double __A, double __B, double __C, double __D, double __E, + double __F, double __G, double __H) { return __extension__ (__m512d) { __H, __G, __F, __E, __D, __C, __B, __A }; } @@ -9282,12 +9178,10 @@ _mm512_set_pd (double __A, double __B, double __C, double __D, #define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \ _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_set_ps (float __A, float __B, float __C, float __D, - float __E, float __F, float __G, float __H, - float __I, float __J, float __K, float __L, - float __M, float __N, float __O, float __P) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_set_ps(float __A, float __B, float __C, float __D, float __E, float __F, + float __G, float __H, float __I, float __J, float __K, float __L, + float __M, float __N, float __O, float __P) { return __extension__ (__m512) { __P, __O, __N, __M, __L, __K, __J, __I, __H, __G, __F, __E, __D, __C, __B, __A }; @@ -9297,27 +9191,23 @@ _mm512_set_ps (float __A, float __B, float __C, float __D, _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \ (e4),(e3),(e2),(e1),(e0)) -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_abs_ps(__m512 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_abs_ps(__m512 __A) { return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A) { return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_abs_pd(__m512d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_abs_pd(__m512d __A) { return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ; } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) { return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); } @@ -9337,19 +9227,23 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) * This takes log2(n) steps where n is the number of elements in the vector. 
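 *
 * For example, an 8-lane '+' reduction runs as three pairwise rounds.
 * A scalar sketch of the idea (illustrative only, not header code):
 *
 *   long long reduce_add8(const long long v[8]) {
 *     long long t[8];
 *     for (int i = 0; i < 8; ++i) t[i] = v[i];        // copy the lanes
 *     for (int w = 4; w >= 1; w /= 2)                 // log2(8) = 3 rounds
 *       for (int i = 0; i < w; ++i) t[i] += t[i + w]; // fold upper half in
 *     return t[0];
 *   }
 *
 * The __builtin_reduce_* calls below are assumed to lower to this
 * log2(n) tree; reduce_add8 is a hypothetical helper name.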
*/ -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_reduce_add_epi64(__m512i __W) { return __builtin_reduce_add((__v8di)__W); } -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_reduce_mul_epi64(__m512i __W) { return __builtin_reduce_mul((__v8di)__W); } -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_reduce_and_epi64(__m512i __W) { return __builtin_reduce_and((__v8di)__W); } -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_reduce_or_epi64(__m512i __W) { return __builtin_reduce_or((__v8di)__W); } @@ -9400,22 +9294,22 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { return __builtin_ia32_reduce_fmul_pd512(1.0, __W); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_add_epi32(__m512i __W) { return __builtin_reduce_add((__v16si)__W); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_mul_epi32(__m512i __W) { return __builtin_reduce_mul((__v16si)__W); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_and_epi32(__m512i __W) { return __builtin_reduce_and((__v16si)__W); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_or_epi32(__m512i __W) { return __builtin_reduce_or((__v16si)__W); } @@ -9466,22 +9360,22 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); } -static __inline__ long long __DEFAULT_FN_ATTRS512 +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_max_epi64(__m512i __V) { return __builtin_reduce_max((__v8di)__V); } -static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_max_epu64(__m512i __V) { return __builtin_reduce_max((__v8du)__V); } -static __inline__ long long __DEFAULT_FN_ATTRS512 +static __inline__ long long __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_min_epi64(__m512i __V) { return __builtin_reduce_min((__v8di)__V); } -static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_min_epu64(__m512i __V) { return __builtin_reduce_min((__v8du)__V); } @@ -9509,22 +9403,22 @@ _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V); return __builtin_reduce_min((__v8du)__V); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_max_epi32(__m512i __V) { return __builtin_reduce_max((__v16si)__V); } -static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +static __inline__ unsigned int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_max_epu32(__m512i __V) { return __builtin_reduce_max((__v16su)__V); } -static __inline__ int __DEFAULT_FN_ATTRS512 +static __inline__ int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_min_epi32(__m512i __V) { return __builtin_reduce_min((__v16si)__V); } -static __inline__ unsigned int 
__DEFAULT_FN_ATTRS512 +static __inline__ unsigned int __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_reduce_min_epu32(__m512i __V) { return __builtin_reduce_min((__v16su)__V); } diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 92df320..6989b86 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -33,15 +33,26 @@ typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1))); __target__("avx512fp16,no-evex512"), \ __min_vector_width__(128))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) { return __a[0]; } -static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) { +static __inline __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_setzero_ph(void) { return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; } -static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) { +static __inline __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_setzero_ph(void) { return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; } @@ -50,7 +61,8 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) { return (__m256h)__builtin_ia32_undef256(); } -static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) { +static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_setzero_ph(void) { return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; @@ -64,14 +76,15 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) { return (__m512h)__builtin_ia32_undef512(); } -static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) { +static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_set1_ph(_Float16 __h) { return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h, __h}; } -static __inline __m512h __DEFAULT_FN_ATTRS512 +static __inline __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, @@ -548,7 +561,8 @@ _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ (__v32hf)_mm512_setzero_ph())) -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_abs_ph(__m512h __A) { return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); } @@ -3348,6 +3362,9 @@ _mm512_permutexvar_ph(__m512i __A, __m512h __B) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 #undef __DEFAULT_FN_ATTRS512 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR +#undef 
__DEFAULT_FN_ATTRS512_CONSTEXPR #endif #endif diff --git a/clang/lib/Headers/avx512vbmi2intrin.h b/clang/lib/Headers/avx512vbmi2intrin.h index 11598c8..f9a5f82 100644 --- a/clang/lib/Headers/avx512vbmi2intrin.h +++ b/clang/lib/Headers/avx512vbmi2intrin.h @@ -215,8 +215,8 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B, - (__v8di)__C); + return (__m512i)__builtin_elementwise_fshl((__v8du)__A, (__v8du)__B, + (__v8du)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -238,8 +238,8 @@ _mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_elementwise_fshl((__v16su)__A, (__v16su)__B, + (__v16su)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -261,8 +261,8 @@ _mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B, - (__v32hi)__C); + return (__m512i)__builtin_elementwise_fshl((__v32hu)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -284,8 +284,9 @@ _mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B, - (__v8di)__C); + // Ops __A and __B are swapped. + return (__m512i)__builtin_elementwise_fshr((__v8du)__B, (__v8du)__A, + (__v8du)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -307,8 +308,9 @@ _mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + // Ops __A and __B are swapped. + return (__m512i)__builtin_elementwise_fshr((__v16su)__B, (__v16su)__A, + (__v16su)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -330,8 +332,9 @@ _mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B, - (__v32hi)__C); + // Ops __A and __B are swapped. 
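  // Scalar model of one 16-bit lane (a sketch; shrd16 is illustrative):
  // VPSHRDVW forms the pair (__B:__A) with __B in the high half and
  // shifts it right, while fshr takes its operands as (hi, lo), hence
  // the swap.
  //   static unsigned short shrd16(unsigned short a, unsigned short b,
  //                                unsigned c) {
  //     unsigned pair = ((unsigned)b << 16) | a;   // b supplies high bits
  //     return (unsigned short)(pair >> (c & 15)); // amount taken mod 16
  //   }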
+ return (__m512i)__builtin_elementwise_fshr((__v32hu)__B, (__v32hu)__A, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512vlbitalgintrin.h b/clang/lib/Headers/avx512vlbitalgintrin.h index 1b01fe0..21bf858 100644 --- a/clang/lib/Headers/avx512vlbitalgintrin.h +++ b/clang/lib/Headers/avx512vlbitalgintrin.h @@ -24,92 +24,76 @@ __target__("avx512vl,avx512bitalg,no-evex512"), \ __min_vector_width__(256))) -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_popcnt_epi16(__m256i __A) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_popcnt_epi16(__m256i __A) { return (__m256i)__builtin_elementwise_popcount((__v16hu)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U, - (__v16hi) _mm256_popcnt_epi16(__B), - (__v16hi) __A); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256( + (__mmask16)__U, (__v16hi)_mm256_popcnt_epi16(__B), (__v16hi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) -{ - return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(), - __U, - __B); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) { + return _mm256_mask_popcnt_epi16((__m256i)_mm256_setzero_si256(), __U, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_popcnt_epi16(__m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_popcnt_epi16(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v8hu)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U, - (__v8hi) _mm_popcnt_epi16(__B), - (__v8hi) __A); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128( + (__mmask8)__U, (__v8hi)_mm_popcnt_epi16(__B), (__v8hi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) -{ - return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(), - __U, - __B); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) { + return _mm_mask_popcnt_epi16((__m128i)_mm_setzero_si128(), __U, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_popcnt_epi8(__m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_popcnt_epi8(__m256i __A) { return (__m256i)__builtin_elementwise_popcount((__v32qu)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U, - (__v32qi) _mm256_popcnt_epi8(__B), - (__v32qi) __A); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_popcnt_epi8(__m256i __A, 
__mmask32 __U, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask32)__U, (__v32qi)_mm256_popcnt_epi8(__B), (__v32qi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) -{ - return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(), - __U, - __B); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) { + return _mm256_mask_popcnt_epi8((__m256i)_mm256_setzero_si256(), __U, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_popcnt_epi8(__m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_popcnt_epi8(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v16qu)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U, - (__v16qi) _mm_popcnt_epi8(__B), - (__v16qi) __A); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_popcnt_epi8(__B), (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) -{ - return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(), - __U, - __B); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) { + return _mm_mask_popcnt_epi8((__m128i)_mm_setzero_si128(), __U, __B); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 @@ -147,5 +131,7 @@ _mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 9aedba0..ea61440 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -24,6 +24,14 @@ __target__("avx512vl,avx512bw,no-evex512"), \ __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + /* Integer compare */ #define _mm_cmp_epi8_mask(a, b, p) \ @@ -478,65 +486,57 @@ _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) (__v16hi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_abs_epi8(__A), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_abs_epi8(__A), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_abs_epi8(__m256i __W, __mmask32 
__U, __m256i __A) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_abs_epi8(__A), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_abs_epi8(__mmask32 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_abs_epi8(__A), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_abs_epi16(__A), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_abs_epi16(__A), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_abs_epi16(__A), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_abs_epi16(__A), (__v16hi)_mm256_setzero_si256()); @@ -1592,67 +1592,62 @@ _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epu16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_ia32_selectw_128( + (__mmask8)__U, (__v8hi)_mm_mulhi_epu16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhi_epu16(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epu16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_ia32_selectw_256( + (__mmask16)__U, (__v16hi)_mm256_mulhi_epu16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_ia32_selectw_256( + (__mmask16)__U, (__v16hi)_mm256_mulhi_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } -static 
__inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_ia32_selectw_128( + (__mmask8)__U, (__v8hi)_mm_mulhi_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhi_epi16(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_ia32_selectw_256( + (__mmask16)__U, (__v16hi)_mm256_mulhi_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_ia32_selectw_256( + (__mmask16)__U, (__v16hi)_mm256_mulhi_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpackhi_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -1960,18 +1955,16 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_slli_epi16(__A, (int)__B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_slli_epi16(__A, (int)__B), (__v16hi)_mm256_setzero_si256()); @@ -2097,34 +2090,30 @@ _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_srai_epi16(__A, (int)__B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B) -{ +static __inline__ 
__m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_srai_epi16(__A, (int)__B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, - unsigned int __B) -{ + unsigned int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_srai_epi16(__A, (int)__B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_srai_epi16(__A, (int)__B), (__v16hi)_mm256_setzero_si256()); @@ -2162,103 +2151,90 @@ _mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_srli_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_srli_epi16(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_srli_epi16(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_srli_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_srli_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mov_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, (__v8hi) __A, (__v8hi) __W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mov_epi16(__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, (__v8hi) __A, (__v8hi) _mm_setzero_si128 ()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mov_epi16(__m256i __W, __mmask16 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, (__v16hi) __A, (__v16hi) __W); } -static __inline__ 
__m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mov_epi16(__mmask16 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, (__v16hi) __A, (__v16hi) _mm256_setzero_si256 ()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mov_epi8(__m128i __W, __mmask16 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, (__v16qi) __A, (__v16qi) __W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mov_epi8(__mmask16 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, (__v16qi) __A, (__v16qi) _mm_setzero_si128 ()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mov_epi8(__m256i __W, __mmask32 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, (__v32qi) __A, (__v32qi) __W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, (__v32qi) __A, (__v32qi) _mm256_setzero_si256 ()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) { @@ -2809,353 +2785,353 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ (__v16hi)_mm256_setzero_si256())) -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_add_epi16(__m128i __W) { return __builtin_reduce_add((__v8hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_mul_epi16(__m128i __W) { return __builtin_reduce_mul((__v8hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_and_epi16(__m128i __W) { return __builtin_reduce_and((__v8hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_or_epi16(__m128i __W) { return __builtin_reduce_or((__v8hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS128 -_mm_mask_reduce_add_epi16( __mmask8 __M, __m128i __W) { +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_reduce_add_epi16(__mmask8 __M, __m128i __W) { __W = _mm_maskz_mov_epi16(__M, __W); return __builtin_reduce_add((__v8hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS128 -_mm_mask_reduce_mul_epi16( __mmask8 __M, __m128i __W) { +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_reduce_mul_epi16(__mmask8 __M, __m128i __W) { __W = _mm_mask_mov_epi16(_mm_set1_epi16(1), __M, __W); return __builtin_reduce_mul((__v8hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS128 -_mm_mask_reduce_and_epi16( __mmask8 __M, __m128i __W) { +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_reduce_and_epi16(__mmask8 __M, __m128i __W) { __W = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __W); return 
__builtin_reduce_and((__v8hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W) { __W = _mm_maskz_mov_epi16(__M, __W); return __builtin_reduce_or((__v8hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_max_epi16(__m128i __V) { return __builtin_reduce_max((__v8hi)__V); } -static __inline__ unsigned short __DEFAULT_FN_ATTRS128 +static __inline__ unsigned short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_max_epu16(__m128i __V) { return __builtin_reduce_max((__v8hu)__V); } -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_min_epi16(__m128i __V) { return __builtin_reduce_min((__v8hi)__V); } -static __inline__ unsigned short __DEFAULT_FN_ATTRS128 +static __inline__ unsigned short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_min_epu16(__m128i __V) { return __builtin_reduce_min((__v8hu)__V); } -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_max_epi16(__mmask16 __M, __m128i __V) { __V = _mm_mask_mov_epi16(_mm_set1_epi16(-32767-1), __M, __V); return __builtin_reduce_max((__v8hi)__V); } -static __inline__ unsigned short __DEFAULT_FN_ATTRS128 +static __inline__ unsigned short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_max_epu16(__mmask16 __M, __m128i __V) { __V = _mm_maskz_mov_epi16(__M, __V); return __builtin_reduce_max((__v8hu)__V); } -static __inline__ short __DEFAULT_FN_ATTRS128 +static __inline__ short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_min_epi16(__mmask16 __M, __m128i __V) { __V = _mm_mask_mov_epi16(_mm_set1_epi16(32767), __M, __V); return __builtin_reduce_min((__v8hi)__V); } -static __inline__ unsigned short __DEFAULT_FN_ATTRS128 +static __inline__ unsigned short __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_min_epu16(__mmask16 __M, __m128i __V) { __V = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __V); return __builtin_reduce_min((__v8hu)__V); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_add_epi16(__m256i __W) { return __builtin_reduce_add((__v16hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_mul_epi16(__m256i __W) { return __builtin_reduce_mul((__v16hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_and_epi16(__m256i __W) { return __builtin_reduce_and((__v16hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_or_epi16(__m256i __W) { return __builtin_reduce_or((__v16hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS256 -_mm256_mask_reduce_add_epi16( __mmask16 __M, __m256i __W) { +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_reduce_add_epi16(__mmask16 __M, __m256i __W) { __W = _mm256_maskz_mov_epi16(__M, __W); return __builtin_reduce_add((__v16hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS256 -_mm256_mask_reduce_mul_epi16( __mmask16 __M, __m256i __W) { +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_reduce_mul_epi16(__mmask16 __M, __m256i __W) { __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(1), __M, __W); return __builtin_reduce_mul((__v16hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS256 
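/* How the masked reductions work (an explanatory sketch, not header
 * code): inactive lanes are first seeded with the operation's identity,
 * 0 for add/or, 1 for mul, all-ones for and, INT16_MIN/INT16_MAX for
 * the signed max/min, so the unmasked __builtin_reduce_* can then visit
 * every lane. Scalar model for the 8 x 16-bit AND case:
 *
 *   short mask_reduce_and8(unsigned char m, const short v[8]) {
 *     short r = -1;                             // identity for AND
 *     for (int i = 0; i < 8; ++i)
 *       r &= ((m >> i) & 1) ? v[i] : (short)-1; // masked-off lane -> -1
 *     return r;
 *   }
 */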
-_mm256_mask_reduce_and_epi16( __mmask16 __M, __m256i __W) { +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_reduce_and_epi16(__mmask16 __M, __m256i __W) { __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __W); return __builtin_reduce_and((__v16hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W) { __W = _mm256_maskz_mov_epi16(__M, __W); return __builtin_reduce_or((__v16hi)__W); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_max_epi16(__m256i __V) { return __builtin_reduce_max((__v16hi)__V); } -static __inline__ unsigned short __DEFAULT_FN_ATTRS256 +static __inline__ unsigned short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_max_epu16(__m256i __V) { return __builtin_reduce_max((__v16hu)__V); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_min_epi16(__m256i __V) { return __builtin_reduce_min((__v16hi)__V); } -static __inline__ unsigned short __DEFAULT_FN_ATTRS256 +static __inline__ unsigned short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_min_epu16(__m256i __V) { return __builtin_reduce_min((__v16hu)__V); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __V) { __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-32767-1), __M, __V); return __builtin_reduce_max((__v16hi)__V); } -static __inline__ unsigned short __DEFAULT_FN_ATTRS256 +static __inline__ unsigned short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __V) { __V = _mm256_maskz_mov_epi16(__M, __V); return __builtin_reduce_max((__v16hu)__V); } -static __inline__ short __DEFAULT_FN_ATTRS256 +static __inline__ short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __V) { __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(32767), __M, __V); return __builtin_reduce_min((__v16hi)__V); } -static __inline__ unsigned short __DEFAULT_FN_ATTRS256 +static __inline__ unsigned short __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __V) { __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __V); return __builtin_reduce_min((__v16hu)__V); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_add_epi8(__m128i __W) { return __builtin_reduce_add((__v16qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_mul_epi8(__m128i __W) { return __builtin_reduce_mul((__v16qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_and_epi8(__m128i __W) { return __builtin_reduce_and((__v16qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_or_epi8(__m128i __W) { return __builtin_reduce_or((__v16qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_add_epi8(__mmask16 __M, __m128i __W) { __W = _mm_maskz_mov_epi8(__M, __W); return __builtin_reduce_add((__v16qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char 
__DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_mul_epi8(__mmask16 __M, __m128i __W) { __W = _mm_mask_mov_epi8(_mm_set1_epi8(1), __M, __W); return __builtin_reduce_mul((__v16qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_and_epi8(__mmask16 __M, __m128i __W) { __W = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __W); return __builtin_reduce_and((__v16qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W) { __W = _mm_maskz_mov_epi8(__M, __W); return __builtin_reduce_or((__v16qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_max_epi8(__m128i __V) { return __builtin_reduce_max((__v16qs)__V); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS128 +static __inline__ unsigned char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_max_epu8(__m128i __V) { return __builtin_reduce_max((__v16qu)__V); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_min_epi8(__m128i __V) { return __builtin_reduce_min((__v16qs)__V); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS128 +static __inline__ unsigned char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_reduce_min_epu8(__m128i __V) { return __builtin_reduce_min((__v16qu)__V); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __V) { __V = _mm_mask_mov_epi8(_mm_set1_epi8(-127-1), __M, __V); return __builtin_reduce_max((__v16qs)__V); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS128 +static __inline__ unsigned char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __V) { __V = _mm_maskz_mov_epi8(__M, __V); return __builtin_reduce_max((__v16qu)__V); } -static __inline__ signed char __DEFAULT_FN_ATTRS128 +static __inline__ signed char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __V) { __V = _mm_mask_mov_epi8(_mm_set1_epi8(127), __M, __V); return __builtin_reduce_min((__v16qs)__V); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS128 +static __inline__ unsigned char __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __V) { __V = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __V); return __builtin_reduce_min((__v16qu)__V); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_add_epi8(__m256i __W) { return __builtin_reduce_add((__v32qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_mul_epi8(__m256i __W) { return __builtin_reduce_mul((__v32qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_and_epi8(__m256i __W) { return __builtin_reduce_and((__v32qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_or_epi8(__m256i __W) { return __builtin_reduce_or((__v32qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_add_epi8(__mmask32 __M, __m256i __W) { __W = 
_mm256_maskz_mov_epi8(__M, __W); return __builtin_reduce_add((__v32qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_mul_epi8(__mmask32 __M, __m256i __W) { __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(1), __M, __W); return __builtin_reduce_mul((__v32qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_and_epi8(__mmask32 __M, __m256i __W) { __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __W); return __builtin_reduce_and((__v32qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W) { __W = _mm256_maskz_mov_epi8(__M, __W); return __builtin_reduce_or((__v32qs)__W); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_max_epi8(__m256i __V) { return __builtin_reduce_max((__v32qs)__V); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS256 +static __inline__ unsigned char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_max_epu8(__m256i __V) { return __builtin_reduce_max((__v32qu)__V); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_min_epi8(__m256i __V) { return __builtin_reduce_min((__v32qs)__V); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS256 +static __inline__ unsigned char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_reduce_min_epu8(__m256i __V) { return __builtin_reduce_min((__v32qu)__V); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __V) { __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-127-1), __M, __V); return __builtin_reduce_max((__v32qs)__V); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS256 +static __inline__ unsigned char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __V) { __V = _mm256_maskz_mov_epi8(__M, __V); return __builtin_reduce_max((__v32qu)__V); } -static __inline__ signed char __DEFAULT_FN_ATTRS256 +static __inline__ signed char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __V) { __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(127), __M, __V); return __builtin_reduce_min((__v32qs)__V); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS256 +static __inline__ unsigned char __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) { __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __V); return __builtin_reduce_min((__v32qu)__V); @@ -3163,5 +3139,7 @@ _mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __AVX512VLBWINTRIN_H */ diff --git a/clang/lib/Headers/avx512vlcdintrin.h b/clang/lib/Headers/avx512vlcdintrin.h index 923e2c5..8f42675 100644 --- a/clang/lib/Headers/avx512vlcdintrin.h +++ b/clang/lib/Headers/avx512vlcdintrin.h @@ -23,6 +23,14 @@ __target__("avx512vl,avx512cd,no-evex512"), \ __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr 
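/* Same guard as the other headers touched by this patch: for C++11 and
 * newer the *_CONSTEXPR attribute sets append `constexpr`, letting the
 * wrappers below fold inside constant expressions; in C they reduce to
 * the plain attribute sets. A hypothetical C++ usage sketch:
 *
 *   constexpr __m128i ones = _mm_set1_epi32(-1);    // all bits set
 *   constexpr __m128i lz   = _mm_lzcnt_epi32(ones); // every lane 0
 */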
+#else +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastmb_epi64 (__mmask8 __A) { @@ -136,89 +144,81 @@ _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_lzcnt_epi32 (__m128i __A) -{ - return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_lzcnt_epi32(__m128i __A) { + return (__m128i)__builtin_elementwise_ctlz((__v4si)__A, + (__v4si)_mm_set1_epi32(32)); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_lzcnt_epi32(__A), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_lzcnt_epi32(__A), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_lzcnt_epi32 (__m256i __A) -{ - return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_lzcnt_epi32(__m256i __A) { + return (__m256i)__builtin_elementwise_ctlz((__v8si)__A, + (__v8si)_mm256_set1_epi32(32)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_lzcnt_epi32(__A), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_lzcnt_epi32(__A), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_lzcnt_epi64 (__m128i __A) -{ - return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_lzcnt_epi64(__m128i __A) { + return (__m128i)__builtin_elementwise_ctlz( + (__v2di)__A, (__v2di)_mm_set1_epi64x((long long)64)); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_lzcnt_epi64(__A), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_lzcnt_epi64(__A), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_lzcnt_epi64 (__m256i __A) -{ - return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A); +static 
__inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_lzcnt_epi64(__m256i __A) { + return (__m256i)__builtin_elementwise_ctlz( + (__v4di)__A, (__v4di)_mm256_set1_epi64x((long long)64)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_lzcnt_epi64(__A), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_lzcnt_epi64(__A), (__v4di)_mm256_setzero_si256()); @@ -226,5 +226,7 @@ _mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __AVX512VLCDINTRIN_H */ diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index 272cdd8..ceebd09 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -24,6 +24,14 @@ __target__("avx512vl,avx512dq,no-evex512"), \ __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi64 (__m256i __A, __m256i __B) { return (__m256i) ((__v4du) __A * (__v4du) __B); @@ -956,9 +964,8 @@ _mm256_movepi64_mask (__m256i __A) return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_broadcast_f32x2 (__m128 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcast_f32x2(__m128 __A) { return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 0, 1, 0, 1, 0, 1, 0, 1); } @@ -979,9 +986,8 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_broadcast_f64x2(__m128d __A) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcast_f64x2(__m128d __A) { return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, 0, 1, 0, 1); } @@ -1002,9 +1008,8 @@ _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcast_i32x2 (__m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_broadcast_i32x2(__m128i __A) { return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 0, 1); } @@ -1025,9 +1030,8 @@ _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcast_i32x2 (__m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcast_i32x2(__m128i __A) { return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 0, 1, 0, 1, 0, 1); } @@ -1048,9 +1052,8 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) 
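/* Note on the broadcast_* conversions above (a scalar model, not part
 * of the patch): __builtin_shufflevector with a repeating index list
 * such as 0,1,0,1,... tiles the 2- or 4-element source across the
 * destination, which is what makes these shuffles constant-foldable.
 *
 *   void bcast_i32x2(int dst[8], const int src[4]) {
 *     for (int i = 0; i < 8; ++i)
 *       dst[i] = src[i % 2];   // indices repeat 0,1,0,1,...
 *   }
 */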
(__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcast_i64x2(__m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcast_i64x2(__m128i __A) { return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, 0, 1, 0, 1); } @@ -1169,5 +1172,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR #endif diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index a12acb7..98ad9b5 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -26,6 +26,14 @@ __target__("avx512fp16,avx512vl,no-evex512"), \ __min_vector_width__(128))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) { return __a[0]; } @@ -315,11 +323,13 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, (__v8hf)_mm_setzero_ph()); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_abs_ph(__m256h __A) { return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_abs_ph(__m128h __A) { return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); } @@ -1419,8 +1429,8 @@ _mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) { static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, - (__v8hf)__C); + return (__m128h)__builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, + (__v8hf)__C); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A, @@ -1429,7 +1439,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A, __m128h __C) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), (__v8hf)__A); } @@ -1437,7 +1447,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), (__v8hf)__C); } @@ -1445,15 +1455,15 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), (__v8hf)_mm_setzero_ph()); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, - -(__v8hf)__C); + 
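
[Editor's note] _mm_abs_ph and _mm256_abs_ph above take absolute values without any floating-point arithmetic: an IEEE binary16 keeps its sign in bit 15, so ANDing each 32-bit lane with 0x7FFF7FFF clears the sign of two half-precision values at once. A scalar sketch of the trick:

#include <stdint.h>

/* Scalar model of the _mm_abs_ph bit trick: one 32-bit lane holds two
 * binary16 values; masking out bit 15 of each gives |x| for both. */
static uint32_t abs_two_halves_model(uint32_t lane) {
  return lane & 0x7FFF7FFFu;
}
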
return (__m128h)__builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, + -(__v8hf)__C); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A, @@ -1476,7 +1486,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), (__v8hf)__C); } @@ -1484,7 +1494,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), (__v8hf)_mm_setzero_ph()); } @@ -1492,22 +1502,22 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + __builtin_elementwise_fma(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), (__v8hf)_mm_setzero_ph()); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, - (__v16hf)__C); + return (__m256h)__builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, + (__v16hf)__C); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), (__v16hf)__A); } @@ -1515,7 +1525,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), (__v16hf)__C); } @@ -1523,22 +1533,22 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), (__v16hf)_mm256_setzero_ph()); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, - -(__v16hf)__C); + return (__m256h)__builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, + -(__v16hf)__C); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), (__v16hf)__A); } @@ -1546,7 +1556,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, 
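
[Editor's note] All four FMA flavors in these hunks collapse onto the single __builtin_elementwise_fma builtin because they differ only in operand signs. Note too that the negation may sit on either multiplicand, which is why the diff shows both fma(-A, B, C) and fma(A, -B, C) for fnmadd. Written out:

  fmadd(a,b,c)  =  a*b + c  =  fma( a, b,  c)
  fmsub(a,b,c)  =  a*b - c  =  fma( a, b, -c)
  fnmadd(a,b,c) = -a*b + c  =  fma(-a, b,  c)  =  fma(a, -b, c)
  fnmsub(a,b,c) = -a*b - c  =  fma(-a, b, -c)
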
(__v16hf)__B, -(__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), (__v16hf)_mm256_setzero_ph()); } @@ -1554,7 +1564,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), (__v16hf)__C); } @@ -1562,7 +1572,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), (__v16hf)_mm256_setzero_ph()); } @@ -1570,7 +1580,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + __builtin_elementwise_fma(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), (__v16hf)_mm256_setzero_ph()); } @@ -1684,7 +1694,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + __builtin_elementwise_fma((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), (__v8hf)__C); } @@ -1692,7 +1702,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), (__v16hf)__C); } @@ -1715,45 +1725,45 @@ _mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, - (__v8hf)__C); + return (__m128h)__builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, + (__v8hf)__C); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), + __builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), (__v8hf)__A); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, - (__v16hf)__C); + return (__m256h)__builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B, + (__v16hf)__C); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), (__v16hf)__A); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, - -(__v8hf)__C); 
+ return (__m128h)__builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, + -(__v8hf)__C); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), + __builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), (__v8hf)__A); } @@ -1761,22 +1771,22 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { return (__m128h)__builtin_ia32_selectph_128( (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), + __builtin_elementwise_fma((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), (__v8hf)__C); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, - -(__v16hf)__C); + return (__m256h)__builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B, + -(__v16hf)__C); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), (__v16hf)__A); } @@ -1784,7 +1794,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { return (__m256h)__builtin_ia32_selectph_256( (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), + __builtin_elementwise_fma((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), (__v16hf)__C); } @@ -2066,6 +2076,8 @@ _mm_reduce_min_ph(__m128h __V) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR #endif #endif diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 2a5f7b4..6e16d2d 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -23,6 +23,14 @@ __target__("avx512vl,no-evex512"), \ __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + typedef short __v2hi __attribute__((__vector_size__(4))); typedef char __v4qi __attribute__((__vector_size__(4))); typedef char __v2qi __attribute__((__vector_size__(2))); @@ -453,9 +461,8 @@ _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_and_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_and_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8su)__a & (__v8su)__b); } @@ -473,9 +480,8 @@ _mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_and_epi32(__m128i __a, __m128i __b) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_and_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4su)__a & 
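
[Editor's note] The _mask/_maskz/_mask3 wrappers throughout these files share one shape: compute the full-width FMA, then use a selectp* builtin to pick, per lane, between the result and a fallback -- the first source for _mask, zero for _maskz, the accumulator for _mask3. A scalar sketch of one lane:

/* One lane of the mask/maskz/mask3 pattern: mask bit set -> fma
 * result, mask bit clear -> the variant's fallback operand. */
static double masked_fma_lane_model(int mask_bit, double a, double b,
                                    double c, double fallback) {
  return mask_bit ? __builtin_fma(a, b, c) : fallback;
}
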
(__v4su)__b); } @@ -899,321 +905,289 @@ _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __A); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)__A); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __C); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df) __A); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)__A); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __C); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + 
__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __A); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)__A); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __C); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df) __A); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, -(__v4df)__C), + (__v4df)__A); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, -(__v4df)__C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __C); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df)_mm256_setzero_pd()); + return 
(__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, -(__v4df)__C), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __A); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)__A); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __C); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __A); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C), + (__v4sf)__A); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __C); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + 
__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __A); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)__A); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __C); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __A); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C), + (__v8sf)__A); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __C); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + 
__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 @@ -1420,41 +1394,37 @@ _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df) __C); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df) __C); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, (__v4df)__B, -(__v4df)__C), + (__v4df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __C); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C), + (__v4sf)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __C); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C), + (__v8sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 @@ -1500,121 +1470,109 @@ _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - -(__v2df) __B, - (__v2df) __C), - (__v2df) __A); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, -(__v2df)__B, (__v2df)__C), + (__v2df)__A); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - -(__v4df) __B, - (__v4df) __C), - (__v4df) __A); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, -(__v4df)__B, (__v4df)__C), + (__v4df)__A); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - -(__v4sf) __B, - (__v4sf) __C), - (__v4sf) __A); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C), + (__v4sf)__A); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) 
__builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - -(__v8sf) __B, - (__v8sf) __C), - (__v8sf) __A); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, -(__v8sf)__B, (__v8sf)__C), + (__v8sf)__A); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - -(__v2df) __B, - -(__v2df) __C), - (__v2df) __A); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, -(__v2df)__B, -(__v2df)__C), + (__v2df)__A); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - -(__v2df) __B, - -(__v2df) __C), - (__v2df) __C); + return (__m128d)__builtin_ia32_selectpd_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v2df)__A, -(__v2df)__B, -(__v2df)__C), + (__v2df)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - -(__v4df) __B, - -(__v4df) __C), - (__v4df) __A); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, -(__v4df)__B, -(__v4df)__C), + (__v4df)__A); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - -(__v4df) __B, - -(__v4df) __C), - (__v4df) __C); + return (__m256d)__builtin_ia32_selectpd_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v4df)__A, -(__v4df)__B, -(__v4df)__C), + (__v4df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - -(__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __A); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C), + (__v4sf)__A); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - -(__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __C); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, + __builtin_elementwise_fma((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C), + (__v4sf)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - -(__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __A); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, -(__v8sf)__B, -(__v8sf)__C), + (__v8sf)__A); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - -(__v8sf) __B, - 
-(__v8sf) __C), - (__v8sf) __C); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, + __builtin_elementwise_fma((__v8sf)__A, -(__v8sf)__B, -(__v8sf)__C), + (__v8sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 @@ -2964,67 +2922,67 @@ _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_abs_epi32(__A), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_abs_epi32(__A), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_abs_epi32(__A), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_abs_epi32(__A), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_abs_epi64 (__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_abs_epi64(__m128i __A) { return (__m128i)__builtin_elementwise_abs((__v2di)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_abs_epi64(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_abs_epi64(__A), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_abs_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_abs_epi64(__A), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_abs_epi64 (__m256i __A) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_abs_epi64(__m256i __A) { return (__m256i)__builtin_elementwise_abs((__v4di)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_abs_epi64(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_abs_epi64(__A), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_abs_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_abs_epi64(__A), (__v4di)_mm256_setzero_si256()); @@ -4358,7 +4316,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rolv_epi32 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B); + return 
(__m128i)__builtin_elementwise_fshl((__v4su)__A, (__v4su)__A, (__v4su)__B); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -4380,7 +4338,7 @@ _mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_rolv_epi32 (__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_elementwise_fshl((__v8su)__A, (__v8su)__A, (__v8su)__B); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -4402,7 +4360,7 @@ _mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rolv_epi64 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B); + return (__m128i)__builtin_elementwise_fshl((__v2du)__A, (__v2du)__A, (__v2du)__B); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -4424,7 +4382,7 @@ _mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_rolv_epi64 (__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B); + return (__m256i)__builtin_elementwise_fshl((__v4du)__A, (__v4du)__A, (__v4du)__B); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -4543,17 +4501,16 @@ _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, + unsigned int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_slli_epi32(__A, (int)__B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_slli_epi32(__A, (int)__B), (__v8si)_mm256_setzero_si256()); @@ -4607,17 +4564,16 @@ _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, + unsigned int __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_slli_epi64(__A, (int)__B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_slli_epi64(__A, (int)__B), (__v4di)_mm256_setzero_si256()); @@ -4626,7 +4582,7 @@ _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rorv_epi32 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_elementwise_fshr((__v4su)__A, (__v4su)__A, (__v4su)__B); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -4648,7 +4604,7 @@ _mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i 
__B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_rorv_epi32 (__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_elementwise_fshr((__v8su)__A, (__v8su)__A, (__v8su)__B); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -4670,7 +4626,7 @@ _mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rorv_epi64 (__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B); + return (__m128i)__builtin_elementwise_fshr((__v2du)__A, (__v2du)__A, (__v2du)__B); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -4692,7 +4648,7 @@ _mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_rorv_epi64 (__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B); + return (__m256i)__builtin_elementwise_fshr((__v4du)__A, (__v4du)__A, (__v4du)__B); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -4711,7 +4667,7 @@ _mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -4719,7 +4675,7 @@ _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -4727,7 +4683,7 @@ _mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4735,7 +4691,7 @@ _mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4743,7 +4699,7 @@ _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4751,7 +4707,7 @@ _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4759,7 +4715,7 @@ _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4767,7 
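
[Editor's note] The rolv/rorv rewrites above rest on the identity that rotating a value is funnel-shifting it with itself, so _mm_rolv_epi32(A, B) becomes __builtin_elementwise_fshl(A, A, B) and rorv the fshr mirror; the unsigned lane types (__v4su and friends) keep the complementary shift logical rather than arithmetic. A scalar sketch of one lane:

#include <stdint.h>

/* Scalar model of one rolv lane: fshl(x, x, n) is a rotate left. The
 * count is taken modulo the element width; n == 0 is split out only to
 * keep the model itself free of an undefined 32-bit shift. */
static uint32_t rotl32_model(uint32_t x, uint32_t n) {
  n &= 31u;
  return n ? ((x << n) | (x >> (32u - n))) : x;
}
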
+4723,7 @@ _mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4775,7 +4731,7 @@ _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -4783,7 +4739,7 @@ _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -4791,7 +4747,7 @@ _mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4799,7 +4755,7 @@ _mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4807,7 +4763,7 @@ _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4815,7 +4771,7 @@ _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4823,7 +4779,7 @@ _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4831,7 +4787,7 @@ _mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4887,17 +4843,16 @@ _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, + unsigned int __B) { 
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_srli_epi32(__A, (int)__B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_srli_epi32(__A, (int)__B), (__v8si)_mm256_setzero_si256()); @@ -4951,23 +4906,22 @@ _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, + unsigned int __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_srli_epi64(__A, (int)__B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_srli_epi64(__A, (int)__B), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4975,7 +4929,7 @@ _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4983,7 +4937,7 @@ _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4991,7 +4945,7 @@ _mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6410,33 +6364,30 @@ _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_srai_epi32(__A, (int)__B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_srai_epi32(__A, (int)__B), 
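
[Editor's note] What the *_CONSTEXPR retagging in these hunks buys is C++ constant evaluation: once every intrinsic in a call chain carries constexpr, the whole expression can fold at compile time. A hedged usage sketch -- this assumes a Clang carrying this patch, a constexpr _mm_set1_epi32, C++11 or later, and the matching -mavx512vl/-mavx512f feature flags:

#include <immintrin.h>

// Expected to be a constant expression after this change: srli and its
// maskz wrapper are both retagged constexpr. It folds only if every
// call in the chain is constexpr in the compiler actually used.
constexpr __m128i shifted_lanes =
    _mm_maskz_srli_epi32(0x5, _mm_set1_epi32(64), 4);
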
(__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, + unsigned int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_srai_epi32(__A, (int)__B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_srai_epi32(__A, (int)__B), (__v8si)_mm256_setzero_si256()); @@ -6486,46 +6437,40 @@ _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_srai_epi64(__m128i __A, unsigned int __imm) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_srai_epi64(__m128i __A, unsigned int __imm) { return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, (int)__imm); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_srai_epi64( + __m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ (__v2di)_mm_srai_epi64(__A, __imm), \ (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ (__v2di)_mm_srai_epi64(__A, __imm), \ (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srai_epi64(__m256i __A, unsigned int __imm) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srai_epi64(__m256i __A, unsigned int __imm) { return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, (int)__imm); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, - unsigned int __imm) -{ + unsigned int __imm) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ (__v4di)_mm256_srai_epi64(__A, __imm), \ (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ (__v4di)_mm256_srai_epi64(__A, __imm), \ (__v4di)_mm256_setzero_si256()); @@ -6792,9 +6737,8 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_broadcast_f32x4(__m128 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcast_f32x4(__m128 __A) { return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 0, 1, 2, 3, 0, 1, 2, 3); } @@ -6815,9 +6759,8 @@ _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_broadcast_i32x4(__m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_broadcast_i32x4(__m128i __A) { return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 2, 3, 0, 1, 2, 3); } @@ -8306,68 +8249,52 @@ _mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) (__v4si)_mm_shuffle_epi32((A), (I)), \ (__v4si)_mm_setzero_si128())) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __A, - (__v2df) __W); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)__A, + (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_pd (__mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __A, - (__v2df) _mm_setzero_pd ()); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mov_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)__A, + (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __A, - (__v4df) __W); +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)__A, + (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __A, - (__v4df) _mm256_setzero_pd ()); +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)__A, + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __A, - (__v4sf) __W); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)__A, + (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_ps (__mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __A, - (__v4sf) _mm_setzero_ps ()); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mov_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)__A, + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __A, - (__v8sf) __W); +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)__A, + (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __A, - (__v8sf) _mm256_setzero_ps ()); +static __inline__ __m256 
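
[Editor's note] The _mm_mask_mov_* family reworked above is the degenerate form of the masking pattern: no arithmetic at all, just the per-lane select, which also makes these the simplest candidates for the constexpr retag. One lane, as a sketch:

/* One lane of _mm_mask_mov_pd: the mask bit alone decides between the
 * source value and the pass-through value. */
static double mask_mov_lane_model(int mask_bit, double src, double fallback) {
  return mask_bit ? src : fallback;
}
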
__DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)__A, + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -8430,8 +8357,9 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) #define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph #define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph - #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR #endif /* __AVX512VLINTRIN_H */ diff --git a/clang/lib/Headers/avx512vlvbmi2intrin.h b/clang/lib/Headers/avx512vlvbmi2intrin.h index 77af2d5..04db52c 100644 --- a/clang/lib/Headers/avx512vlvbmi2intrin.h +++ b/clang/lib/Headers/avx512vlvbmi2intrin.h @@ -415,8 +415,8 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B, - (__v4di)__C); + return (__m256i)__builtin_elementwise_fshl((__v4du)__A, (__v4du)__B, + (__v4du)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -438,8 +438,8 @@ _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B, - (__v2di)__C); + return (__m128i)__builtin_elementwise_fshl((__v2du)__A, (__v2du)__B, + (__v2du)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -461,8 +461,8 @@ _mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B, - (__v8si)__C); + return (__m256i)__builtin_elementwise_fshl((__v8su)__A, (__v8su)__B, + (__v8su)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -484,8 +484,8 @@ _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B, - (__v4si)__C); + return (__m128i)__builtin_elementwise_fshl((__v4su)__A, (__v4su)__B, + (__v4su)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -507,8 +507,8 @@ _mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B, - (__v16hi)__C); + return (__m256i)__builtin_elementwise_fshl((__v16hu)__A, (__v16hu)__B, + (__v16hu)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -530,8 +530,8 @@ _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B, - (__v8hi)__C); + return (__m128i)__builtin_elementwise_fshl((__v8hu)__A, (__v8hu)__B, + (__v8hu)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -553,8 +553,9 @@ _mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shrdv_epi64(__m256i __A, __m256i __B, 
__m256i __C) { - return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B, - (__v4di)__C); + // Ops __A and __B are swapped. + return (__m256i)__builtin_elementwise_fshr((__v4du)__B, (__v4du)__A, + (__v4du)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -576,8 +577,9 @@ _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B, - (__v2di)__C); + // Ops __A and __B are swapped. + return (__m128i)__builtin_elementwise_fshr((__v2du)__B, (__v2du)__A, + (__v2du)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -599,8 +601,9 @@ _mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B, - (__v8si)__C); + // Ops __A and __B are swapped. + return (__m256i)__builtin_elementwise_fshr((__v8su)__B, (__v8su)__A, + (__v8su)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -622,8 +625,9 @@ _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B, - (__v4si)__C); + // Ops __A and __B are swapped. + return (__m128i)__builtin_elementwise_fshr((__v4su)__B, (__v4su)__A, + (__v4su)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -645,8 +649,9 @@ _mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B, - (__v16hi)__C); + // Ops __A and __B are swapped. + return (__m256i)__builtin_elementwise_fshr((__v16hu)__B, (__v16hu)__A, + (__v16hu)__C); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -668,8 +673,9 @@ _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B, - (__v8hi)__C); + // Ops __A and __B are swapped. + return (__m128i)__builtin_elementwise_fshr((__v8hu)__B, (__v8hu)__A, + (__v8hu)__C); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/avx512vpopcntdqintrin.h b/clang/lib/Headers/avx512vpopcntdqintrin.h index e24c2c5..ac71808 100644 --- a/clang/lib/Headers/avx512vpopcntdqintrin.h +++ b/clang/lib/Headers/avx512vpopcntdqintrin.h @@ -16,19 +16,19 @@ #define __AVX512VPOPCNTDQINTRIN_H /* Define the default attributes for the functions in this file. 
*/ +#if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vpopcntdq,evex512"), \ - __min_vector_width__(512))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr + __min_vector_width__(512))) constexpr #else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vpopcntdq,evex512"), \ + __min_vector_width__(512))) #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_popcnt_epi64(__m512i __A) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v8du)__A); } @@ -43,8 +43,7 @@ _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_popcnt_epi32(__m512i __A) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v16su)__A); } diff --git a/clang/lib/Headers/avx512vpopcntdqvlintrin.h b/clang/lib/Headers/avx512vpopcntdqvlintrin.h index b6c819b..bed951b 100644 --- a/clang/lib/Headers/avx512vpopcntdqvlintrin.h +++ b/clang/lib/Headers/avx512vpopcntdqvlintrin.h @@ -16,6 +16,17 @@ #define __AVX512VPOPCNTDQVLINTRIN_H /* Define the default attributes for the functions in this file. */ + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ + __min_vector_width__(128))) constexpr +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ + __min_vector_width__(256))) constexpr +#else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ @@ -24,17 +35,9 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ __min_vector_width__(256))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr -#else -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 #endif -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_popcnt_epi64(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi64(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v2du)__A); } @@ -49,8 +52,7 @@ _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_popcnt_epi32(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi32(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v4su)__A); } @@ -65,7 +67,7 @@ _mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi64(__m256i __A) { return 
(__m256i)__builtin_elementwise_popcount((__v4du)__A); } @@ -81,7 +83,7 @@ _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi32(__m256i __A) { return (__m256i)__builtin_elementwise_popcount((__v8su)__A); } diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 8e497a9..26096da9 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -70,8 +70,8 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32))); #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr #define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr #else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS128 -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 #endif /* Arithmetic */ @@ -87,9 +87,8 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32))); /// A 256-bit vector of [4 x double] containing one of the source operands. /// \returns A 256-bit vector of [4 x double] containing the sums of both /// operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_add_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_add_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4df)__a+(__v4df)__b); } @@ -105,9 +104,8 @@ _mm256_add_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing one of the source operands. /// \returns A 256-bit vector of [8 x float] containing the sums of both /// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_add_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_add_ps(__m256 __a, + __m256 __b) { return (__m256)((__v8sf)__a+(__v8sf)__b); } @@ -123,9 +121,8 @@ _mm256_add_ps(__m256 __a, __m256 __b) /// A 256-bit vector of [4 x double] containing the subtrahend. /// \returns A 256-bit vector of [4 x double] containing the differences between /// both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_sub_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_sub_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4df)__a-(__v4df)__b); } @@ -141,9 +138,8 @@ _mm256_sub_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing the subtrahend. /// \returns A 256-bit vector of [8 x float] containing the differences between /// both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_sub_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, + __m256 __b) { return (__m256)((__v8sf)__a-(__v8sf)__b); } @@ -197,9 +193,8 @@ _mm256_addsub_ps(__m256 __a, __m256 __b) /// A 256-bit vector of [4 x double] containing the divisor. /// \returns A 256-bit vector of [4 x double] containing the quotients of both /// operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_div_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_div_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4df)__a/(__v4df)__b); } @@ -215,9 +210,8 @@ _mm256_div_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing the divisor. 
/// \returns A 256-bit vector of [8 x float] containing the quotients of both /// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_div_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_div_ps(__m256 __a, + __m256 __b) { return (__m256)((__v8sf)__a/(__v8sf)__b); } @@ -317,9 +311,8 @@ _mm256_min_ps(__m256 __a, __m256 __b) /// A 256-bit vector of [4 x double] containing one of the operands. /// \returns A 256-bit vector of [4 x double] containing the products of both /// operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_mul_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_mul_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4df)__a * (__v4df)__b); } @@ -335,9 +328,8 @@ _mm256_mul_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing one of the operands. /// \returns A 256-bit vector of [8 x float] containing the products of both /// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_mul_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, + __m256 __b) { return (__m256)((__v8sf)__a * (__v8sf)__b); } @@ -555,7 +547,7 @@ _mm256_rcp_ps(__m256 __a) /// A 256-bit vector of [4 x double] containing one of the source operands. /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the /// values between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4du)__a & (__v4du)__b); @@ -573,7 +565,7 @@ _mm256_and_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing one of the source operands. /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the /// values between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_and_ps(__m256 __a, __m256 __b) { return (__m256)((__v8su)__a & (__v8su)__b); @@ -594,7 +586,7 @@ _mm256_and_ps(__m256 __a, __m256 __b) /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the /// values of the second operand and the one's complement of the first /// operand. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_pd(__m256d __a, __m256d __b) { return (__m256d)(~(__v4du)__a & (__v4du)__b); @@ -615,7 +607,7 @@ _mm256_andnot_pd(__m256d __a, __m256d __b) /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the /// values of the second operand and the one's complement of the first /// operand. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_andnot_ps(__m256 __a, __m256 __b) { return (__m256)(~(__v8su)__a & (__v8su)__b); @@ -633,7 +625,7 @@ _mm256_andnot_ps(__m256 __a, __m256 __b) /// A 256-bit vector of [4 x double] containing one of the source operands. /// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the /// values between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4du)__a | (__v4du)__b); @@ -651,7 +643,7 @@ _mm256_or_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing one of the source operands. 
/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the /// values between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_or_ps(__m256 __a, __m256 __b) { return (__m256)((__v8su)__a | (__v8su)__b); @@ -669,7 +661,7 @@ _mm256_or_ps(__m256 __a, __m256 __b) /// A 256-bit vector of [4 x double] containing one of the source operands. /// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the /// values between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4du)__a ^ (__v4du)__b); @@ -687,7 +679,7 @@ _mm256_xor_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing one of the source operands. /// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the /// values between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_xor_ps(__m256 __a, __m256 __b) { return (__m256)((__v8su)__a ^ (__v8su)__b); @@ -2190,9 +2182,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param __a /// A 128-bit integer vector of [4 x i32]. /// \returns A 256-bit vector of [4 x double] containing the converted values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_cvtepi32_pd(__m128i __a) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_cvtepi32_pd(__m128i __a) { return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); } @@ -2205,9 +2196,8 @@ _mm256_cvtepi32_pd(__m128i __a) /// \param __a /// A 256-bit integer vector. /// \returns A 256-bit vector of [8 x float] containing the converted values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_cvtepi32_ps(__m256i __a) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_cvtepi32_ps(__m256i __a) { return (__m256)__builtin_convertvector((__v8si)__a, __v8sf); } @@ -2256,9 +2246,8 @@ _mm256_cvtps_epi32(__m256 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 256-bit vector of [4 x double] containing the converted values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_cvtps_pd(__m128 __a) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_cvtps_pd(__m128 __a) { return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); } @@ -2392,7 +2381,7 @@ _mm256_cvtss_f32(__m256 __a) /// return value. /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated /// values. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movehdup_ps(__m256 __a) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); @@ -2417,7 +2406,7 @@ _mm256_movehdup_ps(__m256 __a) /// return value. /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated /// values. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_moveldup_ps(__m256 __a) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); @@ -2439,7 +2428,7 @@ _mm256_moveldup_ps(__m256 __a) /// the return value. /// \returns A 256-bit vector of [4 x double] containing the moved and /// duplicated values. 
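/*
 * Illustration only, not part of this header: _mm256_movedup_pd below is a
 * pure lane shuffle, which is why it can be evaluated at compile time once
 * marked constexpr. A minimal scalar model, with 4-element double arrays
 * standing in for __m256d (the helper name is hypothetical):
 *
 *   static void movedup_pd_model(const double a[4], double r[4]) {
 *     r[0] = a[0]; r[1] = a[0];  // duplicate the even element of the low lane
 *     r[2] = a[2]; r[3] = a[2];  // and of the high lane
 *   }
 */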
-static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_movedup_pd(__m256d __a) { return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); @@ -2462,9 +2451,8 @@ _mm256_movedup_pd(__m256d __a) /// Bits [127:64] are written to bits [127:64] of the return value. \n /// Bits [255:192] are written to bits [255:192] of the return value. \n /// \returns A 256-bit vector of [4 x double] containing the interleaved values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_unpackhi_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_unpackhi_pd(__m256d __a, __m256d __b) { return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); } @@ -2484,9 +2472,8 @@ _mm256_unpackhi_pd(__m256d __a, __m256d __b) /// Bits [63:0] are written to bits [127:64] of the return value. \n /// Bits [191:128] are written to bits [255:192] of the return value. \n /// \returns A 256-bit vector of [4 x double] containing the interleaved values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_unpacklo_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_unpacklo_pd(__m256d __a, __m256d __b) { return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); } @@ -2511,9 +2498,8 @@ _mm256_unpacklo_pd(__m256d __a, __m256d __b) /// Bits [223:192] are written to bits [191:160] of the return value. \n /// Bits [255:224] are written to bits [255:224] of the return value. /// \returns A 256-bit vector of [8 x float] containing the interleaved values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_unpackhi_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_unpackhi_ps(__m256 __a, __m256 __b) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); } @@ -2538,9 +2524,8 @@ _mm256_unpackhi_ps(__m256 __a, __m256 __b) /// Bits [159:128] are written to bits [191:160] of the return value. \n /// Bits [191:160] are written to bits [255:224] of the return value. /// \returns A 256-bit vector of [8 x float] containing the interleaved values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_unpacklo_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_unpacklo_ps(__m256 __a, __m256 __b) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); } @@ -3777,7 +3762,7 @@ _mm256_set_ps(float __a, float __b, float __c, float __d, /// \param __i7 /// A 32-bit integral value used to initialize bits [31:0] of the result. /// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7) { @@ -3825,7 +3810,7 @@ _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, /// \param __w00 /// A 16-bit integral value used to initialize bits [15:0] of the result. /// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, @@ -3908,7 +3893,7 @@ _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, /// \param __b00 /// An 8-bit integral value used to initialize bits [7:0] of the result. 
/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, @@ -3943,7 +3928,7 @@ _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, /// \param __d /// A 64-bit integral value used to initialize bits [63:0] of the result. /// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) { return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a }; @@ -4044,7 +4029,7 @@ _mm256_setr_ps(float __a, float __b, float __c, float __d, /// \param __i7 /// A 32-bit integral value used to initialize bits [255:224] of the result. /// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7) { @@ -4092,7 +4077,7 @@ _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, /// \param __w00 /// A 16-bit integral value used to initialize bits [255:240] of the result. /// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, short __w07, short __w06, short __w05, short __w04, @@ -4177,7 +4162,7 @@ _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, /// \param __b00 /// An 8-bit integral value used to initialize bits [255:248] of the result. /// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, char __b23, char __b22, char __b21, char __b20, @@ -4210,7 +4195,7 @@ _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, /// \param __d /// A 64-bit integral value used to initialize bits [255:192] of the result. /// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) { return _mm256_set_epi64x(__d, __c, __b, __a); @@ -4267,7 +4252,7 @@ _mm256_set1_ps(float __w) /// A 32-bit integral value used to initialize each vector element of the /// result. /// \returns An initialized 256-bit integer vector of [8 x i32]. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi32(int __i) { return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i); @@ -4285,7 +4270,7 @@ _mm256_set1_epi32(int __i) /// A 16-bit integral value used to initialize each vector element of the /// result. /// \returns An initialized 256-bit integer vector of [16 x i16]. 
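/*
 * Illustration only, not part of this header: the set/setr pairs above
 * differ only in argument order (set_* lists elements from the most
 * significant down, setr_* from the least significant up), and set1_* is
 * the broadcast special case. The first two calls below build the same
 * vector; the third fills every lane with one value:
 *
 *   __m256i x = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 *   __m256i y = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);  // equal to x
 *   __m256i z = _mm256_set1_epi32(42);                      // all lanes 42
 */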
-static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi16(short __w) { return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w, @@ -4303,7 +4288,7 @@ _mm256_set1_epi16(short __w) /// An 8-bit integral value used to initialize each vector element of the /// result. /// \returns An initialized 256-bit integer vector of [32 x i8]. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi8(char __b) { return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, @@ -4324,7 +4309,7 @@ _mm256_set1_epi8(char __b) /// A 64-bit integral value used to initialize each vector element of the /// result. /// \returns An initialized 256-bit integer vector of [4 x i64]. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_epi64x(long long __q) { return _mm256_set_epi64x(__q, __q, __q, __q); @@ -4379,7 +4364,7 @@ _mm256_setzero_si256(void) { /// A 256-bit floating-point vector of [4 x double]. /// \returns A 256-bit floating-point vector of [8 x float] containing the same /// bitwise pattern as the parameter. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_ps(__m256d __a) { return (__m256)__a; @@ -4396,7 +4381,7 @@ _mm256_castpd_ps(__m256d __a) /// A 256-bit floating-point vector of [4 x double]. /// \returns A 256-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd_si256(__m256d __a) { return (__m256i)__a; @@ -4413,7 +4398,7 @@ _mm256_castpd_si256(__m256d __a) /// A 256-bit floating-point vector of [8 x float]. /// \returns A 256-bit floating-point vector of [4 x double] containing the same /// bitwise pattern as the parameter. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_pd(__m256 __a) { return (__m256d)__a; @@ -4430,7 +4415,7 @@ _mm256_castps_pd(__m256 __a) /// A 256-bit floating-point vector of [8 x float]. /// \returns A 256-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps_si256(__m256 __a) { return (__m256i)__a; @@ -4447,7 +4432,7 @@ _mm256_castps_si256(__m256 __a) /// A 256-bit integer vector. /// \returns A 256-bit floating-point vector of [8 x float] containing the same /// bitwise pattern as the parameter. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_ps(__m256i __a) { return (__m256)__a; @@ -4464,7 +4449,7 @@ _mm256_castsi256_ps(__m256i __a) /// A 256-bit integer vector. /// \returns A 256-bit floating-point vector of [4 x double] containing the same /// bitwise pattern as the parameter. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_pd(__m256i __a) { return (__m256d)__a; @@ -4481,7 +4466,7 @@ _mm256_castsi256_pd(__m256i __a) /// A 256-bit floating-point vector of [4 x double]. /// \returns A 128-bit floating-point vector of [2 x double] containing the /// lower 128 bits of the parameter. 
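/*
 * Illustration only, not part of this header: the 256-to-128-bit casts
 * below keep the low half of the register and are typically free in
 * optimized code, since the low 128 bits of a ymm register alias the
 * corresponding xmm register. Typical use:
 *
 *   __m256d wide = _mm256_set1_pd(1.0);
 *   __m128d low  = _mm256_castpd256_pd128(wide);  // bits [127:0] of wide
 */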
-static __inline __m128d __DEFAULT_FN_ATTRS +static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castpd256_pd128(__m256d __a) { return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); @@ -4498,7 +4483,7 @@ _mm256_castpd256_pd128(__m256d __a) /// A 256-bit floating-point vector of [8 x float]. /// \returns A 128-bit floating-point vector of [4 x float] containing the /// lower 128 bits of the parameter. -static __inline __m128 __DEFAULT_FN_ATTRS +static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castps256_ps128(__m256 __a) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); @@ -4514,7 +4499,7 @@ _mm256_castps256_ps128(__m256 __a) /// A 256-bit integer vector. /// \returns A 128-bit integer vector containing the lower 128 bits of the /// parameter. -static __inline __m128i __DEFAULT_FN_ATTRS +static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_castsi256_si128(__m256i __a) { return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h index 52addb7..ce8c79e 100644 --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -345,10 +345,15 @@ static __inline int __get_cpuid_count (unsigned int __leaf, // In some configurations, __cpuidex is defined as a builtin (primarily // -fms-extensions) which will conflict with the __cpuidex definition below. #if !(__has_builtin(__cpuidex)) +// In some cases, offloading will set the host as the aux triple and define the +// builtin. Given __has_builtin does not detect builtins on aux triples, we need +// to explicitly check for some offloading cases. +#ifndef __NVPTX__ static __inline void __cpuidex(int __cpu_info[4], int __leaf, int __subleaf) { __cpuid_count(__leaf, __subleaf, __cpu_info[0], __cpu_info[1], __cpu_info[2], __cpu_info[3]); } #endif +#endif #endif /* __CPUID_H */ diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 78e8a42..e15a260 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -17,7 +17,6 @@ #include <xmmintrin.h> typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); -typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); typedef long long __m128i_u @@ -25,13 +24,9 @@ typedef long long __m128i_u /* Type defines. */ typedef double __v2df __attribute__((__vector_size__(16))); -typedef long long __v2di __attribute__((__vector_size__(16))); -typedef short __v8hi __attribute__((__vector_size__(16))); -typedef char __v16qi __attribute__((__vector_size__(16))); /* Unsigned types */ typedef unsigned long long __v2du __attribute__((__vector_size__(16))); -typedef unsigned short __v8hu __attribute__((__vector_size__(16))); typedef unsigned char __v16qu __attribute__((__vector_size__(16))); /* We need an explicitly signed variant for char. Note that this shouldn't @@ -67,6 +62,9 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); #define __trunc64(x) \ (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __zext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, 2, 3) #define __anyext128(x) \ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, -1, -1) @@ -2127,8 +2125,9 @@ _mm_add_epi32(__m128i __a, __m128i __b) { /// \param __b /// A 64-bit integer. 
/// \returns A 64-bit integer containing the sum of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) { - return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_si64(__m64 __a, + __m64 __b) { + return (__m64)(((__v1du)__a)[0] + ((__v1du)__b)[0]); } /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], @@ -2169,8 +2168,8 @@ _mm_add_epi64(__m128i __a, __m128i __b) { /// A 128-bit signed [16 x i8] vector. /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of /// both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_adds_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b); } @@ -2191,8 +2190,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, /// A 128-bit signed [8 x i16] vector. /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of /// both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_adds_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b); } @@ -2213,8 +2212,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, /// A 128-bit unsigned [16 x i8] vector. /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums /// of both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_adds_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b); } @@ -2235,8 +2234,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, /// A 128-bit unsigned [8 x i16] vector. /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums /// of both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_adds_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b); } @@ -2393,8 +2392,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, /// A 128-bit signed [8 x i16] vector. /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of /// each of the eight 32-bit products. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mulhi_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); } @@ -2412,9 +2411,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, /// A 128-bit unsigned [8 x i16] vector. /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits /// of each of the eight 32-bit products. 
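/*
 * Illustration only, not part of this header: a scalar model of the
 * unsigned high multiply below (helper name is hypothetical). Each pair of
 * 16-bit lanes is multiplied into a full 32-bit product and the top half
 * is kept:
 *
 *   static unsigned short mulhi_u16_model(unsigned short a, unsigned short b) {
 *     return (unsigned short)(((unsigned)a * (unsigned)b) >> 16);
 *   }
 */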
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, - __m128i __b) { - return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mulhi_epu16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pmulhuw128((__v8hu)__a, (__v8hu)__b); } /// Multiplies the corresponding elements of two signed [8 x i16] @@ -2431,8 +2430,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, /// A 128-bit signed [8 x i16] vector. /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of /// each of the eight 32-bit products. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mullo_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hu)__a * (__v8hu)__b); } @@ -2449,9 +2448,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, /// \param __b /// A 64-bit integer containing one of the source operands. /// \returns A 64-bit integer vector containing the product of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { - return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a), - (__v4si)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_su32(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_pmuludq128((__v4si)__zext128(__a), + (__v4si)__zext128(__b))); } /// Multiplies 32-bit unsigned integer values contained in the lower @@ -2467,8 +2467,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { /// \param __b /// A [2 x i64] vector containing one of the source operands. /// \returns A [2 x i64] vector containing the product of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mul_epu32(__m128i __a, __m128i __b) { return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); } @@ -2557,8 +2557,9 @@ _mm_sub_epi32(__m128i __a, __m128i __b) { /// A 64-bit integer vector containing the subtrahend. /// \returns A 64-bit integer vector containing the difference of the values in /// the operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) { - return (__m64)((unsigned long long)__a - (unsigned long long)__b); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_si64(__m64 __a, + __m64 __b) { + return (__m64)(((__v1du)__a)[0] - ((__v1du)__b)[0]); } /// Subtracts the corresponding elements of two [2 x i64] vectors. @@ -2595,8 +2596,8 @@ _mm_sub_epi64(__m128i __a, __m128i __b) { /// A 128-bit integer vector containing the subtrahends. /// \returns A 128-bit integer vector containing the differences of the values /// in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_subs_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b); } @@ -2617,8 +2618,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, /// A 128-bit integer vector containing the subtrahends. /// \returns A 128-bit integer vector containing the differences of the values /// in the operands. 
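/*
 * Illustration only, not part of this header: a scalar model of the signed
 * saturating subtraction below (helper name is hypothetical); the widened
 * difference is clamped to the 16-bit range instead of wrapping:
 *
 *   static short subs_i16_model(short a, short b) {
 *     int d = (int)a - (int)b;
 *     return (short)(d > 32767 ? 32767 : d < -32768 ? -32768 : d);
 *   }
 */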
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_subs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b); } @@ -2638,8 +2639,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, /// A 128-bit integer vector containing the subtrahends. /// \returns A 128-bit integer vector containing the unsigned integer /// differences of the values in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_subs_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b); } @@ -2659,8 +2660,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, /// A 128-bit integer vector containing the subtrahends. /// \returns A 128-bit integer vector containing the unsigned integer /// differences of the values in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_subs_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b); } @@ -2676,8 +2677,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, /// A 128-bit integer vector containing one of the source operands. /// \returns A 128-bit integer vector containing the bitwise AND of the values /// in both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_and_si128(__m128i __a, __m128i __b) { return (__m128i)((__v2du)__a & (__v2du)__b); } @@ -2695,8 +2696,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, /// A 128-bit vector containing the right source operand. /// \returns A 128-bit integer vector containing the bitwise AND of the one's /// complement of the first operand and the values in the second operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_andnot_si128(__m128i __a, __m128i __b) { return (__m128i)(~(__v2du)__a & (__v2du)__b); } /// Performs a bitwise OR of two 128-bit integer vectors. @@ -2711,8 +2712,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, /// A 128-bit integer vector containing one of the source operands. /// \returns A 128-bit integer vector containing the bitwise OR of the values /// in both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_or_si128(__m128i __a, __m128i __b) { return (__m128i)((__v2du)__a | (__v2du)__b); } @@ -2728,8 +2729,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, /// A 128-bit integer vector containing one of the source operands. /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the /// values in both operands. 
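/*
 * Illustration only, not part of this header: the bitwise intrinsics here
 * combine into common mask idioms (mask and v are placeholder vectors):
 *
 *   __m128i cleared = _mm_andnot_si128(mask, v);  // v with mask bits cleared
 *   __m128i zero    = _mm_xor_si128(v, v);        // canonical zeroing idiom
 */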
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_xor_si128(__m128i __a, __m128i __b) { return (__m128i)((__v2du)__a ^ (__v2du)__b); } @@ -2771,8 +2772,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, /// An integer value specifying the number of bits to left-shift each value /// in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, - int __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_slli_epi16(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); } @@ -2807,8 +2808,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, /// An integer value specifying the number of bits to left-shift each value /// in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, - int __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_slli_epi32(__m128i __a, int __count) { return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); } @@ -2843,8 +2844,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, /// An integer value specifying the number of bits to left-shift each value /// in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, - int __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_slli_epi64(__m128i __a, int __count) { return __builtin_ia32_psllqi128((__v2di)__a, __count); } @@ -2880,8 +2881,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, /// An integer value specifying the number of bits to right-shift each value /// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, - int __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srai_epi16(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); } @@ -2918,8 +2919,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, /// An integer value specifying the number of bits to right-shift each value /// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, - int __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srai_epi32(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); } @@ -2980,8 +2981,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, /// An integer value specifying the number of bits to right-shift each value /// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. 
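/*
 * Illustration only, not part of this header: srai_* above shifts
 * arithmetically (the sign bit fills the vacated positions) while srli_*
 * below shifts logically (zeros fill). A scalar model of one 16-bit lane,
 * with hypothetical helper names; note clang implements >> on negative
 * signed values as an arithmetic shift:
 *
 *   static short sra16_model(short a, int n) {
 *     return (short)(a >> n);      // sign-extending
 *   }
 *   static unsigned short srl16_model(unsigned short a, int n) {
 *     return (unsigned short)(a >> n);  // zero-filling
 *   }
 */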
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, - int __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srli_epi16(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); } @@ -3016,8 +3017,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, /// An integer value specifying the number of bits to right-shift each value /// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, - int __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srli_epi32(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); } @@ -3052,8 +3053,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, /// An integer value specifying the number of bits to right-shift each value /// in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, - int __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srli_epi64(__m128i __a, int __count) { return __builtin_ia32_psrlqi128((__v2di)__a, __count); } @@ -3089,8 +3090,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmpeq_epi8(__m128i __a, __m128i __b) { return (__m128i)((__v16qi)__a == (__v16qi)__b); } @@ -3108,8 +3109,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmpeq_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a == (__v8hi)__b); } @@ -3127,8 +3128,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmpeq_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a == (__v4si)__b); } @@ -3147,8 +3148,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmpgt_epi8(__m128i __a, __m128i __b) { /* This function always performs a signed comparison, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ return (__m128i)((__v16qs)__a > (__v16qs)__b); @@ -3169,8 +3170,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. 
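/*
 * Illustration only, not part of this header: these comparisons return
 * all-ones (0xFFFF) in each lane where the relation holds and zero
 * elsewhere, so the result plugs straight into mask/blend idioms, e.g. a
 * per-lane signed maximum (a and b are placeholder vectors):
 *
 *   __m128i gt  = _mm_cmpgt_epi16(a, b);
 *   __m128i max = _mm_or_si128(_mm_and_si128(gt, a),
 *                              _mm_andnot_si128(gt, b));
 */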
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmpgt_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a > (__v8hi)__b); } @@ -3189,8 +3190,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmpgt_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a > (__v4si)__b); } @@ -3209,8 +3210,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmplt_epi8(__m128i __a, __m128i __b) { return _mm_cmpgt_epi8(__b, __a); } @@ -3229,8 +3230,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmplt_epi16(__m128i __a, __m128i __b) { return _mm_cmpgt_epi16(__b, __a); } @@ -3249,8 +3250,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, /// \param __b /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmplt_epi32(__m128i __a, __m128i __b) { return _mm_cmpgt_epi32(__b, __a); } @@ -3379,7 +3380,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) { /// \param __a /// A 32-bit signed integer operand. /// \returns A 128-bit vector of [4 x i32]. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi32_si128(int __a) { return __extension__(__m128i)(__v4si){__a, 0, 0, 0}; } @@ -3394,7 +3396,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) { /// \param __a /// A 64-bit signed integer operand containing the value to be converted. /// \returns A 128-bit vector of [2 x i64] containing the converted value. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi64_si128(long long __a) { return __extension__(__m128i)(__v2di){__a, 0}; } @@ -3409,7 +3412,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) { /// A vector of [4 x i32]. The least significant 32 bits are moved to the /// destination. /// \returns A 32-bit signed integer containing the moved value. -static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) { +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi128_si32(__m128i __a) { __v4si __b = (__v4si)__a; return __b[0]; } @@ -3425,7 +3429,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) { /// A vector of [2 x i64]. 
The least significant 64 bits are moved to the /// destination. /// \returns A 64-bit signed integer containing the moved value. -static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) { +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi128_si64(__m128i __a) { return __a[0]; } @@ -4415,8 +4420,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) { /// Bits [119:112] are written to bits [111:104] of the result. \n /// Bits [127:120] are written to bits [127:120] of the result. /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpackhi_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector( (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11, 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15); @@ -4443,8 +4448,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, /// Bits [111:96] are written to bits [95:80] of the result. \n /// Bits [127:112] are written to bits [127:112] of the result. /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpackhi_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5, 8 + 5, 6, 8 + 6, 7, 8 + 7); } @@ -4466,8 +4471,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, /// Bits [95:64] are written to bits [64:32] of the destination. \n /// Bits [127:96] are written to bits [127:96] of the destination. /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpackhi_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3, 4 + 3); } @@ -4487,8 +4492,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, /// A 128-bit vector of [2 x i64]. \n /// Bits [127:64] are written to bits [127:64] of the destination. /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpackhi_epi64(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1); } @@ -4521,8 +4526,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, /// Bits [55:48] are written to bits [111:104] of the result. \n /// Bits [63:56] are written to bits [127:120] of the result. /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 
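/*
 * Illustration only, not part of this header: a classic use of the byte
 * unpacks is widening unsigned 8-bit lanes to 16 bits by interleaving with
 * zeros (v is a placeholder vector):
 *
 *   __m128i zero   = _mm_setzero_si128();
 *   __m128i lo_u16 = _mm_unpacklo_epi8(v, zero);  // bytes 0..7  as u16
 *   __m128i hi_u16 = _mm_unpackhi_epi8(v, zero);  // bytes 8..15 as u16
 */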
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpacklo_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector( (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4, 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7); @@ -4550,8 +4555,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, /// Bits [47:32] are written to bits [95:80] of the result. \n /// Bits [63:48] are written to bits [127:112] of the result. /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpacklo_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1, 8 + 1, 2, 8 + 2, 3, 8 + 3); } @@ -4573,8 +4578,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, /// Bits [31:0] are written to bits [64:32] of the destination. \n /// Bits [63:32] are written to bits [127:96] of the destination. /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpacklo_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1, 4 + 1); } @@ -4594,8 +4599,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, /// A 128-bit vector of [2 x i64]. \n /// Bits [63:0] are written to bits [127:64] of the destination. \n /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 
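Just below, f16cintrin.h drops the vcvtph2ps builtins: the scalar path becomes a __builtin_bit_cast to __fp16 followed by an ordinary widening conversion, and the vector paths use __builtin_convertvector, all of which constant-fold where the target builtins did not. The scalar trick in isolation (a sketch, assuming a target where __fp16 is IEEE binary16):

    static inline float cvtsh_ss_sketch(unsigned short bits) {
      __fp16 h = __builtin_bit_cast(__fp16, bits); /* reinterpret the 16 bits */
      return (float)h;                             /* lossless widening */
    }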
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpacklo_epi64(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0); } diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h index 94a662c..ede67af 100644 --- a/clang/lib/Headers/f16cintrin.h +++ b/clang/lib/Headers/f16cintrin.h @@ -38,9 +38,7 @@ static __inline float __DEFAULT_FN_ATTRS128 _cvtsh_ss(unsigned short __a) { - __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; - __v4sf __r = __builtin_ia32_vcvtph2ps(__v); - return __r[0]; + return (float)__builtin_bit_cast(__fp16, __a); } /// Converts a 32-bit single-precision float value to a 16-bit @@ -109,7 +107,10 @@ _cvtsh_ss(unsigned short __a) static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_cvtph_ps(__m128i __a) { - return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a); + typedef __fp16 __v4fp16 __attribute__((__vector_size__(8))); + + __v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3); + return (__m128) __builtin_convertvector((__v4fp16)__v, __v4sf); } /// Converts a 256-bit vector of [8 x float] into a 128-bit vector @@ -153,7 +154,9 @@ _mm_cvtph_ps(__m128i __a) static __inline __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtph_ps(__m128i __a) { - return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a); + typedef __fp16 __v8fp16 __attribute__((__vector_size__(16), __aligned__(16))); + + return (__m256) __builtin_convertvector((__v8fp16)__a, __v8sf); } #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h index 694801b..e0a0e4c 100644 --- a/clang/lib/Headers/fma4intrin.h +++ b/clang/lib/Headers/fma4intrin.h @@ -20,16 +20,24 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256))) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B, + (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -44,16 +52,16 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +static 
__inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B, + -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -68,16 +76,16 @@ _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, + (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -92,16 +100,16 @@ _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, + -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -140,52 +148,52 @@ _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B, + (__v4df)__C); } -static __inline__ __m256 
__DEFAULT_FN_ATTRS256 -_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, + -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B, + -(__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, + (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, + -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, + -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 @@ -214,5 +222,7 @@ _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __FMA4INTRIN_H */ diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h index 22d1a78..d8ea489 100644 --- a/clang/lib/Headers/fmaintrin.h +++ b/clang/lib/Headers/fmaintrin.h @@ -18,6 +18,14 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + /// Computes a multiply-add of 128-bit vectors of [4 x float]. /// For each element, computes <c> (__A * __B) + __C </c>. 
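The gate is __cplusplus >= 201103L because constexpr only exists from C++11 on; in C, or in C++98, the _CONSTEXPR macros collapse to the plain attribute sets and every declaration stays exactly as it was. In C++11 and later the payoff is compile-time evaluation. A quick check of the effect (a sketch, not part of the patch; assumes clang with this tree):

    #include <immintrin.h>

    #if defined(__cplusplus) && (__cplusplus >= 201103L)
    constexpr __m128 a = {1.0f, 2.0f, 3.0f, 4.0f};
    constexpr __m128 b = {2.0f, 2.0f, 2.0f, 2.0f};
    constexpr __m128 c = {1.0f, 1.0f, 1.0f, 1.0f};
    constexpr __m128 r = _mm_fmadd_ps(a, b, c); /* {3, 5, 7, 9}, folded */
    static_assert(r[0] == 3.0f && r[3] == 9.0f, "computed during translation");
    #endif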
/// @@ -32,10 +40,11 @@ /// \param __C /// A 128-bit vector of [4 x float] containing the addend. /// \returns A 128-bit vector of [4 x float] containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); } /// Computes a multiply-add of 128-bit vectors of [2 x double]. @@ -52,10 +61,11 @@ _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) /// \param __C /// A 128-bit vector of [2 x double] containing the addend. /// \returns A 128-bit [2 x double] vector containing the result. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B, + (__v2df)__C); } /// Computes a scalar multiply-add of the single-precision values in the @@ -130,10 +140,11 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) /// \param __C /// A 128-bit vector of [4 x float] containing the subtrahend. /// \returns A 128-bit vector of [4 x float] containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); + return (__m128)__builtin_elementwise_fma((__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); } /// Computes a multiply-subtract of 128-bit vectors of [2 x double]. @@ -150,10 +161,11 @@ _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) /// \param __C /// A 128-bit vector of [2 x double] containing the addend. /// \returns A 128-bit vector of [2 x double] containing the result. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); + return (__m128d)__builtin_elementwise_fma((__v2df)__A, (__v2df)__B, + -(__v2df)__C); } /// Computes a scalar multiply-subtract of the single-precision values in @@ -228,10 +240,11 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) /// \param __C /// A 128-bit vector of [4 x float] containing the addend. /// \returns A 128-bit [4 x float] vector containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); } /// Computes a negated multiply-add of 128-bit vectors of [2 x double]. @@ -248,10 +261,11 @@ _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) /// \param __C /// A 128-bit vector of [2 x double] containing the addend. /// \returns A 128-bit vector of [2 x double] containing the result. 
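Both fma4intrin.h and fmaintrin.h funnel every packed variant through the single generic builtin, encoding the sub/negated forms as operand negation; vector negation just flips each lane's sign bit, so each lane still gets one fused multiply-add with a single rounding. The identities, restated as a sketch:

    /* fmadd(a,b,c)  == fma( a, b,  c)  ->   a*b + c
       fmsub(a,b,c)  == fma( a, b, -c)  ->   a*b - c
       fnmadd(a,b,c) == fma(-a, b,  c)  ->  -(a*b) + c
       fnmsub(a,b,c) == fma(-a, b, -c)  ->  -(a*b) - c  */
    typedef float v4sf_demo __attribute__((__vector_size__(16)));
    static inline v4sf_demo fmsub_sketch(v4sf_demo a, v4sf_demo b,
                                         v4sf_demo c) {
      return __builtin_elementwise_fma(a, b, -c);
    }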
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, + (__v2df)__C); } /// Computes a scalar negated multiply-add of the single-precision values in @@ -326,10 +340,11 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) /// \param __C /// A 128-bit vector of [4 x float] containing the subtrahend. /// \returns A 128-bit vector of [4 x float] containing the result. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); + return (__m128)__builtin_elementwise_fma(-(__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); } /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double]. @@ -346,10 +361,11 @@ _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) /// \param __C /// A 128-bit vector of [2 x double] containing the subtrahend. /// \returns A 128-bit vector of [2 x double] containing the result. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); + return (__m128d)__builtin_elementwise_fma(-(__v2df)__A, (__v2df)__B, + -(__v2df)__C); } /// Computes a scalar negated multiply-subtract of the single-precision @@ -528,10 +544,11 @@ _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) /// \param __C /// A 256-bit vector of [8 x float] containing the addend. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); } /// Computes a multiply-add of 256-bit vectors of [4 x double]. @@ -548,10 +565,11 @@ _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) /// \param __C /// A 256-bit vector of [4 x double] containing the addend. /// \returns A 256-bit vector of [4 x double] containing the result. -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B, + (__v4df)__C); } /// Computes a multiply-subtract of 256-bit vectors of [8 x float]. @@ -568,10 +586,11 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) /// \param __C /// A 256-bit vector of [8 x float] containing the subtrahend. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); + return (__m256)__builtin_elementwise_fma((__v8sf)__A, (__v8sf)__B, + -(__v8sf)__C); } /// Computes a multiply-subtract of 256-bit vectors of [4 x double]. 
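Note that the scalar forms (_mm_fmadd_ss, _mm_fnmadd_sd, and the rest, visible here only as hunk-header context) keep their __builtin_ia32_vfmadd* bodies: they fuse only the low lane and pass __A's remaining lanes through unchanged, a merge the elementwise builtin does not express on its own. What the scalar semantics require, modeled with a hypothetical helper (not the header's code):

    typedef float v4sf_model __attribute__((__vector_size__(16)));
    static inline v4sf_model fmadd_ss_model(v4sf_model a, v4sf_model b,
                                            v4sf_model c) {
      v4sf_model r = a;                        /* lanes 1-3 copied from a */
      r[0] = __builtin_fmaf(a[0], b[0], c[0]); /* one fused rounding, lane 0 */
      return r;
    }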
@@ -588,10 +607,11 @@ _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) /// \param __C /// A 256-bit vector of [4 x double] containing the subtrahend. /// \returns A 256-bit vector of [4 x double] containing the result. -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); + return (__m256d)__builtin_elementwise_fma((__v4df)__A, (__v4df)__B, + -(__v4df)__C); } /// Computes a negated multiply-add of 256-bit vectors of [8 x float]. @@ -608,10 +628,11 @@ _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) /// \param __C /// A 256-bit vector of [8 x float] containing the addend. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); } /// Computes a negated multiply-add of 256-bit vectors of [4 x double]. @@ -628,10 +649,11 @@ _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) /// \param __C /// A 256-bit vector of [4 x double] containing the addend. /// \returns A 256-bit vector of [4 x double] containing the result. -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, + (__v4df)__C); } /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float]. @@ -648,10 +670,11 @@ _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) /// \param __C /// A 256-bit vector of [8 x float] containing the subtrahend. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); + return (__m256)__builtin_elementwise_fma(-(__v8sf)__A, (__v8sf)__B, + -(__v8sf)__C); } /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double]. @@ -668,10 +691,11 @@ _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) /// \param __C /// A 256-bit vector of [4 x double] containing the subtrahend. /// \returns A 256-bit vector of [4 x double] containing the result. 
-static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); + return (__m256d)__builtin_elementwise_fma(-(__v4df)__A, (__v4df)__B, + -(__v4df)__C); } /// Computes a multiply with alternating add/subtract of 256-bit vectors of @@ -792,5 +816,7 @@ _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __FMAINTRIN_H */ diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index cbc518d..21a9c30 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -2190,10 +2190,10 @@ float4 sqrt(float4); // step builtins //===----------------------------------------------------------------------===// -/// \fn T step(T x, T y) +/// \fn T step(T y, T x) /// \brief Returns 1 if the x parameter is greater than or equal to the y -/// parameter; otherwise, 0. vector. \param x [in] The first floating-point -/// value to compare. \param y [in] The first floating-point value to compare. +/// parameter; otherwise, 0. vector. \param y [in] The first floating-point +/// value to compare. \param x [in] The second floating-point value to compare. /// /// Step is based on the following formula: (x >= y) ? 1 : 0 diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index dc0fa5c..4ed95c5 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -57,6 +57,9 @@ typedef char __v16qi __attribute__((__vector_size__(16))); #define __trunc64(x) \ (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __zext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, 2, 3) #define __anyext128(x) \ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, -1, -1) @@ -85,7 +88,7 @@ _mm_empty(void) { /// A 32-bit integer value. /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the /// parameter. The upper 32 bits are set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtsi32_si64(int __i) { return __extension__ (__m64)(__v2si){__i, 0}; @@ -102,7 +105,7 @@ _mm_cvtsi32_si64(int __i) /// A 64-bit integer vector. /// \returns A 32-bit signed integer value containing the lower 32 bits of the /// parameter. -static __inline__ int __DEFAULT_FN_ATTRS_SSE2 +static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtsi64_si32(__m64 __m) { return ((__v2si)__m)[0]; @@ -118,10 +121,10 @@ _mm_cvtsi64_si32(__m64 __m) /// A 64-bit signed integer. /// \returns A 64-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtsi64_m64(long long __i) { - return (__m64)__i; + return __extension__ (__m64)(__v1di){__i}; } /// Casts a 64-bit integer vector into a 64-bit signed integer value. @@ -134,10 +137,10 @@ _mm_cvtsi64_m64(long long __i) /// A 64-bit integer vector. /// \returns A 64-bit signed integer containing the same bitwise pattern as the /// parameter. 
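__zext128 differs from the neighboring __anyext128 only in the last two shuffle indices: 2 and 3 select the lanes of the zero-initialized second operand, where -1 leaves those lanes undefined. Zeroed upper lanes are free after instruction selection but, unlike undef lanes, are fully defined values, which is what constant evaluation requires; the hunks below move the constexpr-tagged MMX intrinsics over accordingly. A standalone restatement (sketch):

    typedef int v2si_demo __attribute__((__vector_size__(8)));
    typedef int v4si_demo __attribute__((__vector_size__(16)));

    static inline v4si_demo zext128_demo(v2si_demo x) {
      v2si_demo zero = {0, 0};
      /* lanes 0,1 from x; lanes 2,3 from zero, so the upper half is 0 */
      return __builtin_shufflevector(x, zero, 0, 1, 2, 3);
    }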
-static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 +static __inline__ long long __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cvtm64_si64(__m64 __m) { - return (long long)__m; + return ((__v1di)__m)[0]; } /// Converts, with saturation, 16-bit signed integers from both 64-bit integer @@ -239,11 +242,10 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2) /// Bits [63:56] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, - 4, 12, 5, 13, 6, 14, 7, 15); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 12, 5, + 13, 6, 14, 7, 15); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -263,11 +265,9 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) /// Bits [63:48] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, - 2, 6, 3, 7); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 6, 3, 7); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -285,10 +285,9 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] @@ -312,11 +311,10 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) /// Bits [31:24] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, - 0, 8, 1, 9, 2, 10, 3, 11); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8, 1, 9, + 2, 10, 3, 11); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -336,11 +334,9 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) /// Bits [31:16] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. 
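_mm_cvtsi64_m64 and _mm_cvtm64_si64 trade GNU scalar-to-vector casts for an explicit one-element vector build and a lane read. The bit pattern is identical either way; presumably the vector forms are what clang's constant evaluator folds once the _CONSTEXPR attribute is attached (the diff itself does not say). Equivalent standalone sketch:

    typedef long long v1di_demo __attribute__((__vector_size__(8)));

    static inline v1di_demo from_scalar(long long x) {
      v1di_demo v = {x}; /* replaces (v1di_demo)x: same 64-bit pattern */
      return v;
    }
    static inline long long to_scalar(v1di_demo v) {
      return v[0];       /* replaces (long long)v */
    }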
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, - 0, 4, 1, 5); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4, 1, 5); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -358,10 +354,9 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2); } /// Adds each 8-bit integer element of the first 64-bit integer vector @@ -379,7 +374,7 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_add_pi8(__m64 __m1, __m64 __m2) { return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); @@ -400,7 +395,7 @@ _mm_add_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_add_pi16(__m64 __m1, __m64 __m2) { return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); @@ -421,7 +416,7 @@ _mm_add_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_add_pi32(__m64 __m1, __m64 __m2) { return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); @@ -445,10 +440,9 @@ _mm_add_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_adds_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_adds_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2); } /// Adds, with saturation, each 16-bit signed integer element of the first @@ -469,10 +463,9 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_adds_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_adds_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2); } /// Adds, with saturation, each 8-bit unsigned integer element of the first @@ -492,10 +485,9 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. 
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_adds_pu8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_adds_pu8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2); } /// Adds, with saturation, each 16-bit unsigned integer element of the first @@ -515,10 +507,9 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_adds_pu16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_adds_pu16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2); } /// Subtracts each 8-bit integer element of the second 64-bit integer @@ -536,7 +527,7 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_sub_pi8(__m64 __m1, __m64 __m2) { return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); @@ -557,7 +548,7 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_sub_pi16(__m64 __m1, __m64 __m2) { return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); @@ -578,7 +569,7 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32] containing the subtrahends. /// \returns A 64-bit integer vector of [2 x i32] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_sub_pi32(__m64 __m1, __m64 __m2) { return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); @@ -602,10 +593,9 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_subs_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_subs_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2); } /// Subtracts, with saturation, each 16-bit signed integer element of the @@ -626,10 +616,9 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. 
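All eight saturating MMX add/sub intrinsics now share the two generic builtins, whose clamping matches the hardware: an out-of-range lane result pins to the lane type's limits instead of wrapping. The edge cases, as a sketch (hypothetical demo function):

    #include <mmintrin.h>

    /* signed i8:    127 + 1  -> 127      -128 - 1 -> -128
       unsigned u8:  250 + 10 -> 255         3 - 7 -> 0    */
    static inline __m64 saturate_demo(__m64 a, __m64 b) {
      __m64 s = _mm_adds_pi8(a, b); /* signed clamp to [-128, 127] */
      return _mm_subs_pu8(s, b);    /* unsigned clamp to [0, 255] */
    }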
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_subs_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_subs_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2); } /// Subtracts each 8-bit unsigned integer element of the second 64-bit @@ -650,10 +639,9 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_subs_pu8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_subs_pu8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2); } /// Subtracts each 16-bit unsigned integer element of the second 64-bit @@ -674,10 +662,9 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_subs_pu16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_subs_pu16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -723,11 +710,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { - return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1), - (__v8hi)__anyext128(__m2))); + return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__zext128(__m1), + (__v8hi)__zext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -745,7 +732,7 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_mullo_pi16(__m64 __m1, __m64 __m2) { return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); @@ -791,11 +778,9 @@ _mm_sll_pi16(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. 
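_mm_mulhi_pi16 keeps its widen-to-128-bit shape but now goes through __zext128, and per lane it returns the high half of the full 32-bit product. A scalar model of one lane (sketch):

    #include <mmintrin.h>

    /* one lane: (short)(((int)a_lane * (int)b_lane) >> 16) */
    static inline __m64 mulhi_demo(__m64 a, __m64 b) {
      return _mm_mulhi_pi16(a, b);
    }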
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_slli_pi16(__m64 __m, int __count) -{ - return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m), - __count)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_slli_pi16(__m64 __m, int __count) { + return __trunc64(__builtin_ia32_psllwi128((__v8hi)__zext128(__m), __count)); } /// Left-shifts each 32-bit signed integer element of the first @@ -838,11 +823,9 @@ _mm_sll_pi32(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_slli_pi32(__m64 __m, int __count) -{ - return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m), - __count)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_slli_pi32(__m64 __m, int __count) { + return __trunc64(__builtin_ia32_pslldi128((__v4si)__zext128(__m), __count)); } /// Left-shifts the first 64-bit integer parameter by the number of bits @@ -880,11 +863,9 @@ _mm_sll_si64(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_slli_si64(__m64 __m, int __count) -{ - return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m), - __count)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_slli_si64(__m64 __m, int __count) { + return __trunc64(__builtin_ia32_psllqi128((__v2di)__zext128(__m), __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -929,11 +910,9 @@ _mm_sra_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_srai_pi16(__m64 __m, int __count) -{ - return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m), - __count)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_srai_pi16(__m64 __m, int __count) { + return __trunc64(__builtin_ia32_psrawi128((__v8hi)__zext128(__m), __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -978,11 +957,9 @@ _mm_sra_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_srai_pi32(__m64 __m, int __count) -{ - return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m), - __count)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_srai_pi32(__m64 __m, int __count) { + return __trunc64(__builtin_ia32_psradi128((__v4si)__zext128(__m), __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -1025,11 +1002,9 @@ _mm_srl_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_srli_pi16(__m64 __m, int __count) -{ - return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m), - __count)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_srli_pi16(__m64 __m, int __count) { + return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__zext128(__m), __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -1072,11 +1047,9 @@ _mm_srl_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_srli_pi32(__m64 __m, int __count) -{ - return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m), - __count)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_srli_pi32(__m64 __m, int __count) { + return __trunc64(__builtin_ia32_psrldi128((__v4si)__zext128(__m), __count)); } /// Right-shifts the first 64-bit integer parameter by the number of bits @@ -1115,11 +1088,9 @@ _mm_srl_si64(__m64 __m, __m64 __count) /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_srli_si64(__m64 __m, int __count) -{ - return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m), - __count)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_srli_si64(__m64 __m, int __count) { + return __trunc64(__builtin_ia32_psrlqi128((__v2di)__zext128(__m), __count)); } /// Performs a bitwise AND of two 64-bit integer vectors. @@ -1134,7 +1105,7 @@ _mm_srli_si64(__m64 __m, int __count) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_and_si64(__m64 __m1, __m64 __m2) { return (__m64)(((__v1du)__m1) & ((__v1du)__m2)); @@ -1155,7 +1126,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of the second /// parameter and the one's complement of the first parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_andnot_si64(__m64 __m1, __m64 __m2) { return (__m64)(~((__v1du)__m1) & ((__v1du)__m2)); @@ -1173,7 +1144,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_or_si64(__m64 __m1, __m64 __m2) { return (__m64)(((__v1du)__m1) | ((__v1du)__m2)); @@ -1191,7 +1162,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_xor_si64(__m64 __m1, __m64 __m2) { return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2)); @@ -1213,7 +1184,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. 
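Every immediate-count MMX shift follows the same widen/operate/truncate shape. Annotated for _mm_srli_pi16 (a restatement of the header's body, not new code):

    #include <mmintrin.h>

    /* _mm_srli_pi16 body, annotated:
         __trunc64(                   keep the low 64 bits of the result
           __builtin_ia32_psrlwi128(  SSE2 16-bit logical right shift
             (__v8hi)__zext128(__m),  widen; upper lanes zero, not undef
             __count))
       __anyext128's undefined upper lanes were fine for codegen but block
       constant evaluation; __zext128's zeros let the expression fold.  */
    static inline __m64 srli_demo(__m64 m) {
      return _mm_srli_pi16(m, 3); /* each 16-bit lane shifted right by 3 */
    }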
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2)); @@ -1235,7 +1206,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2)); @@ -1257,7 +1228,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { return (__m64)(((__v2si)__m1) == ((__v2si)__m2)); @@ -1279,7 +1250,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { /* This function always performs a signed comparison, but __v8qi is a char @@ -1303,7 +1274,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { return (__m64)((__v4hi)__m1 > (__v4hi)__m2); @@ -1325,7 +1296,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { return (__m64)((__v2si)__m1 > (__v2si)__m2); diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h index 7f7d387..f902ca1 100644 --- a/clang/lib/Headers/ptrauth.h +++ b/clang/lib/Headers/ptrauth.h @@ -95,7 +95,7 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; __ptrauth qualifier; the compiler will perform this check automatically. */ -#if __has_feature(ptrauth_intrinsics) +#if __has_feature(ptrauth_intrinsics) || defined(__PTRAUTH__) /* Strip the signature from a value without authenticating it. @@ -388,6 +388,6 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; #define __ptrauth_objc_isa_uintptr #define __ptrauth_objc_super_pointer -#endif /* __has_feature(ptrauth_intrinsics) */ +#endif /* __has_feature(ptrauth_intrinsics) || defined(__PTRAUTH__) */ #endif /* __PTRAUTH_H */ diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index bc6fe4c..f68dd7e 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -27,6 +27,12 @@ __min_vector_width__(128))) #endif +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + /* SSE4 Rounding macros. 
*/ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -561,8 +567,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1, /// A 128-bit vector of [4 x i32]. /// \returns A 128-bit vector of [2 x i64] containing the products of both /// operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1, - __m128i __V2) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mul_epi32(__m128i __V1, __m128i __V2) { return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2); } @@ -1205,8 +1211,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, /// \param __V2 /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, - __m128i __V2) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) { return (__m128i)((__v2di)__V1 == (__v2di)__V2); } @@ -1224,7 +1230,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are /// sign-extended to 16-bit values. /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepi8_epi16(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ return (__m128i) __builtin_convertvector( @@ -1246,7 +1253,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are /// sign-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepi8_epi32(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ return (__m128i) __builtin_convertvector( @@ -1266,7 +1274,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are /// sign-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepi8_epi64(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char which may be signed or unsigned, so use __v16qs. */ return (__m128i) __builtin_convertvector( @@ -1286,7 +1295,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are /// sign-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepi16_epi32(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); } @@ -1304,7 +1314,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are /// sign-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepi16_epi64(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); } @@ -1322,7 +1333,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are /// sign-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepi32_epi64(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); } @@ -1341,7 +1353,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are /// zero-extended to 16-bit values. /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepu8_epi16(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), @@ -1361,7 +1374,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are /// zero-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepu8_epi32(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); } @@ -1379,7 +1393,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are /// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepu8_epi64(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); } @@ -1397,7 +1412,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are /// zero-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 
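The shared lowering here: shuffle out the low lanes, then widen with __builtin_convertvector. The source lane type's signedness selects sign- versus zero-extension, which is why the signed byte variants cast through __v16qs (explicitly signed char) while plain __v16qi char lanes may be unsigned on some targets, as the header's own comments note. Sketch of the observable difference:

    #include <smmintrin.h>

    /* low bytes {0x80, 0x7f, ...}:
       _mm_cvtepi8_epi32 -> {-128, 127, ...}   (sign-extend)
       _mm_cvtepu8_epi32 -> { 128, 127, ...}   (zero-extend) */
    static inline __m128i sext_demo(__m128i v) { return _mm_cvtepi8_epi32(v); }
    static inline __m128i zext_demo(__m128i v) { return _mm_cvtepu8_epi32(v); }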
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepu16_epi32(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); } @@ -1415,7 +1431,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are /// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepu16_epi64(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); } @@ -1433,7 +1450,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are /// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepu32_epi64(__m128i __V) { return (__m128i) __builtin_convertvector( __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); } @@ -2320,12 +2338,13 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) { /// \param __V2 /// A 128-bit integer vector. /// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1, - __m128i __V2) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) { return (__m128i)((__v2di)__V1 > (__v2di)__V2); } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #include <popcntintrin.h> diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 371cc82..f01c61a 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -33,6 +33,12 @@ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, -1, -1) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + /// Computes the absolute value of each of the packed 8-bit signed /// integers in the source operand and stores the 8-bit unsigned integer /// results in the destination. @@ -45,9 +51,7 @@ /// A 64-bit vector of [8 x i8]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_abs_pi8(__m64 __a) -{ +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_abs_pi8(__m64 __a) { return (__m64)__builtin_elementwise_abs((__v8qs)__a); } @@ -63,10 +67,9 @@ _mm_abs_pi8(__m64 __a) /// A 128-bit vector of [16 x i8]. /// \returns A 128-bit integer vector containing the absolute values of the /// elements in the operand. 
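__builtin_elementwise_abs matches the PABS* semantics, including the corner case: the most negative lane value maps to itself, since its positive counterpart is unrepresentable in the lane type (no undefined behavior, unlike scalar abs on INT_MIN). Sketch:

    #include <tmmintrin.h>

    /* a lane holding -128 (0x80) comes back as -128 (0x80) */
    static inline __m128i abs_demo(__m128i v) {
      return _mm_abs_epi8(v);
    }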
@@ -63,10 +67,9 @@ _mm_abs_pi8(__m64 __a)
 ///    A 128-bit vector of [16 x i8].
 /// \returns A 128-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi8(__m128i __a)
-{
-    return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_abs_epi8(__m128i __a) {
+  return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
 }
 
 /// Computes the absolute value of each of the packed 16-bit signed
@@ -81,10 +84,8 @@ _mm_abs_epi8(__m128i __a)
 ///    A 64-bit vector of [4 x i16].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_abs_pi16(__m64 __a)
-{
-    return (__m64)__builtin_elementwise_abs((__v4hi)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_abs_pi16(__m64 __a) {
+  return (__m64)__builtin_elementwise_abs((__v4hi)__a);
 }
 
 /// Computes the absolute value of each of the packed 16-bit signed
@@ -99,10 +100,9 @@ _mm_abs_pi16(__m64 __a)
 ///    A 128-bit vector of [8 x i16].
 /// \returns A 128-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi16(__m128i __a)
-{
-    return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_abs_epi16(__m128i __a) {
+  return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
 }
 
 /// Computes the absolute value of each of the packed 32-bit signed
@@ -117,10 +117,8 @@ _mm_abs_epi16(__m128i __a)
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_abs_pi32(__m64 __a)
-{
-    return (__m64)__builtin_elementwise_abs((__v2si)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_abs_pi32(__m64 __a) {
+  return (__m64)__builtin_elementwise_abs((__v2si)__a);
 }
 
 /// Computes the absolute value of each of the packed 32-bit signed
@@ -135,10 +133,9 @@ _mm_abs_pi32(__m64 __a)
 ///    A 128-bit vector of [4 x i32].
 /// \returns A 128-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi32(__m128i __a)
-{
-    return (__m128i)__builtin_elementwise_abs((__v4si)__a);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_abs_epi32(__m128i __a) {
+  return (__m128i)__builtin_elementwise_abs((__v4si)__a);
 }
 
 /// Concatenates the two 128-bit integer vector operands, and
@@ -806,5 +803,6 @@ _mm_sign_pi32(__m64 __a, __m64 __b)
 #undef __anyext128
 #undef __trunc64
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif /* __TMMINTRIN_H */
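/* [Editorial example, not part of the patch] Behavior of the rewritten
   absolute-value intrinsics above is unchanged; only the lowering now goes
   through the generic __builtin_elementwise_abs. Note that, like PABSW
   itself, the minimum value wraps rather than saturating: */
#include <tmmintrin.h>
__m128i demo_abs(void) {
  __m128i v = _mm_setr_epi16(-5, 5, -32768, 0, 1, -1, 7, -7);
  /* result lanes: 5, 5, -32768 (0x8000 has no positive counterpart),
     0, 1, 1, 7, 7 */
  return _mm_abs_epi16(v);
}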
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6a64369..6d44cff 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -16,7 +16,6 @@
 
 #include <mmintrin.h>
 
-typedef int __v4si __attribute__((__vector_size__(16)));
 typedef float __v4sf __attribute__((__vector_size__(16)));
 
 typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
@@ -24,6 +23,7 @@ typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
 /* Unsigned types */
 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
+typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
 
 /* This header should only be included in a hosted environment as it depends on
  * a standard library to provide allocation routines.
  */
@@ -1688,7 +1688,7 @@ _mm_cvtsi64_ss(__m128 __a, long long __b) {
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value of the second operand. The upper 64 bits are copied from
 ///    the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
   return (__m128)__builtin_shufflevector(
@@ -1714,7 +1714,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    converted value from the second operand. The upper 64 bits are copied
 ///    from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
@@ -2447,11 +2447,11 @@ _mm_movemask_pi8(__m64 __a)
 /// \param __b
 ///    A 64-bit integer vector containing one of the source operands.
 /// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
-  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
-                                             (__v8hi)__anyext128(__b)));
+  return __trunc64(__builtin_ia32_pmulhuw128((__v8hu)__zext128(__a),
+                                             (__v8hu)__zext128(__b)));
 }
 
 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
@@ -2873,7 +2873,7 @@ _mm_movelh_ps(__m128 __a, __m128 __b) {
 ///    from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtpi16_ps(__m64 __a)
 {
   return __builtin_convertvector((__v4hi)__a, __v4sf);
@@ -2891,7 +2891,7 @@ _mm_cvtpi16_ps(__m64 __a)
 ///    destination are copied from the corresponding elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtpu16_ps(__m64 __a)
 {
   return __builtin_convertvector((__v4hu)__a, __v4sf);
@@ -2909,7 +2909,7 @@ _mm_cvtpu16_ps(__m64 __a)
 ///    from the corresponding lower 4 elements in this operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtpi8_ps(__m64 __a)
 {
   return __builtin_convertvector(
@@ -2930,7 +2930,7 @@ _mm_cvtpi8_ps(__m64 __a)
 ///    operand.
 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
 ///    values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtpu8_ps(__m64 __a)
 {
   return __builtin_convertvector(
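/* [Editorial example, not part of the patch] _mm_mulhi_pu16, changed above,
   returns the high 16 bits of each unsigned 16x16->32 product, so the
   operands must be widened as unsigned values; switching to __v8hu with a
   zero-extending __zext128 makes that explicit where __anyext128 left the
   upper half unspecified. A worked sketch: */
#include <xmmintrin.h>
__m64 demo_mulhi(void) {
  __m64 a = _mm_set1_pi16((short)0xFFFF); /* 65535 when read as unsigned */
  __m64 b = _mm_set1_pi16(2);
  /* 65535 * 2 = 0x1FFFE, so every result lane is the high half, 0x0001 */
  return _mm_mulhi_pu16(a, b);
}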
@@ -2954,7 +2954,7 @@ _mm_cvtpu8_ps(__m64 __a)
 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
 ///    copied and converted values from the first operand. The upper 64 bits
 ///    contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
   return __builtin_convertvector(
diff --git a/clang/lib/Headers/xopintrin.h b/clang/lib/Headers/xopintrin.h
index 976cdf4..7015719 100644
--- a/clang/lib/Headers/xopintrin.h
+++ b/clang/lib/Headers/xopintrin.h
@@ -20,6 +20,14 @@
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
 {
@@ -182,13 +190,13 @@ _mm_hsubq_epi32(__m128i __A)
   return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
 {
   return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
 {
   return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
@@ -203,25 +211,25 @@ _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_rot_epi8(__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+  return (__m128i)__builtin_elementwise_fshl((__v16qu)__A, (__v16qu)__A, (__v16qu)__B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_rot_epi16(__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+  return (__m128i)__builtin_elementwise_fshl((__v8hu)__A, (__v8hu)__A, (__v8hu)__B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_rot_epi32(__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+  return (__m128i)__builtin_elementwise_fshl((__v4su)__A, (__v4su)__A, (__v4su)__B);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_rot_epi64(__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+  return (__m128i)__builtin_elementwise_fshl((__v2du)__A, (__v2du)__A, (__v2du)__B);
 }
 
 #define _mm_roti_epi8(A, N) \
@@ -766,5 +774,7 @@ _mm256_frcz_pd(__m256d __A)
 
 #undef __DEFAULT_FN_ATTRS
 #undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 
 #endif /* __XOPINTRIN_H */
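/* [Editorial example, not part of the patch] A funnel shift with both
   inputs equal, fshl(a, a, n), is exactly a rotate left, which is why the
   XOP rotates above can share the generic builtin. _mm_cmov_si128 is the
   plain bitwise select (A & C) | (B & ~C): where a mask bit is 1 the result
   takes A, where it is 0 it takes B. A sketch, assuming -mxop: */
#include <x86intrin.h>
__m128i demo_xop(__m128i a, __m128i b, __m128i mask) {
  __m128i r = _mm_rot_epi32(a, _mm_set1_epi32(8)); /* rotate each lane left 8 */
  return _mm_cmov_si128(r, b, mask);               /* per-bit select */
}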