Diffstat (limited to 'clang/lib/Headers')
60 files changed, 2487 insertions, 2204 deletions
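The first substantive change below rewrites __clang_cuda_complex_builtins.h to call Clang's portable __builtin_isnan, __builtin_isinf, and __builtin_copysign directly instead of dispatching through per-target macros (_ISNANd, _COPYSIGNd, and friends), keeping only the NVPTX-specific scalbn/logb paths behind a FIXME. As a minimal sketch (not part of the diff; function name is illustrative), the operand fixup that __muldc3 performs with those builtins when both result parts are NaN looks like this in plain C:

/* Sketch only: models the recovery step in __muldc3. An infinite operand
   collapses to a signed 1.0 and a NaN partner to a signed 0.0 before the
   product is recomputed. */
#include <stdio.h>

static double collapse_inf(double x) {
  return __builtin_copysign(__builtin_isinf(x) ? 1.0 : 0.0, x);
}

int main(void) {
  printf("%g %g\n", collapse_inf(__builtin_inf()), collapse_inf(-0.0)); /* prints: 1 -0 */
  return 0;
}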
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 33fff76..038859a 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -119,6 +119,7 @@ set(opencl_files set(ppc_files altivec.h + amo.h ) set(ppc_htm_files diff --git a/clang/lib/Headers/__clang_cuda_complex_builtins.h b/clang/lib/Headers/__clang_cuda_complex_builtins.h index 7bc7bc2..e3038dc 100644 --- a/clang/lib/Headers/__clang_cuda_complex_builtins.h +++ b/clang/lib/Headers/__clang_cuda_complex_builtins.h @@ -23,63 +23,17 @@ #define __DEVICE__ __device__ inline #endif -// To make the algorithms available for C and C++ in CUDA and OpenMP we select -// different but equivalent function versions. TODO: For OpenMP we currently -// select the native builtins as the overload support for templates is lacking. -#if !defined(__OPENMP_NVPTX__) && !defined(__OPENMP_AMDGCN__) -#define _ISNANd std::isnan -#define _ISNANf std::isnan -#define _ISINFd std::isinf -#define _ISINFf std::isinf -#define _ISFINITEd std::isfinite -#define _ISFINITEf std::isfinite -#define _COPYSIGNd std::copysign -#define _COPYSIGNf std::copysign -#define _SCALBNd std::scalbn -#define _SCALBNf std::scalbn -#define _ABSd std::abs -#define _ABSf std::abs -#define _LOGBd std::logb -#define _LOGBf std::logb -// Rather than pulling in std::max from algorithm everytime, use available ::max. -#define _fmaxd max -#define _fmaxf max -#else -#ifdef __AMDGCN__ -#define _ISNANd __ocml_isnan_f64 -#define _ISNANf __ocml_isnan_f32 -#define _ISINFd __ocml_isinf_f64 -#define _ISINFf __ocml_isinf_f32 -#define _ISFINITEd __ocml_isfinite_f64 -#define _ISFINITEf __ocml_isfinite_f32 -#define _COPYSIGNd __ocml_copysign_f64 -#define _COPYSIGNf __ocml_copysign_f32 -#define _SCALBNd __ocml_scalbn_f64 -#define _SCALBNf __ocml_scalbn_f32 -#define _ABSd __ocml_fabs_f64 -#define _ABSf __ocml_fabs_f32 -#define _LOGBd __ocml_logb_f64 -#define _LOGBf __ocml_logb_f32 -#define _fmaxd __ocml_fmax_f64 -#define _fmaxf __ocml_fmax_f32 -#else -#define _ISNANd __nv_isnand -#define _ISNANf __nv_isnanf -#define _ISINFd __nv_isinfd -#define _ISINFf __nv_isinff -#define _ISFINITEd __nv_isfinited -#define _ISFINITEf __nv_finitef -#define _COPYSIGNd __nv_copysign -#define _COPYSIGNf __nv_copysignf +#ifdef __NVPTX__ +// FIXME: NVPTX should use generic builtins. #define _SCALBNd __nv_scalbn #define _SCALBNf __nv_scalbnf -#define _ABSd __nv_fabs -#define _ABSf __nv_fabsf #define _LOGBd __nv_logb #define _LOGBf __nv_logbf -#define _fmaxd __nv_fmax -#define _fmaxf __nv_fmaxf -#endif +#else +#define _SCALBNd __builtin_scalbn +#define _SCALBNf __builtin_scalbnf +#define _LOGBd __builtin_logb +#define _LOGBf __builtin_logbf #endif #if defined(__cplusplus) @@ -95,36 +49,36 @@ __DEVICE__ double _Complex __muldc3(double __a, double __b, double __c, double _Complex z; __real__(z) = __ac - __bd; __imag__(z) = __ad + __bc; - if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) { + if (__builtin_isnan(__real__(z)) && __builtin_isnan(__imag__(z))) { int __recalc = 0; - if (_ISINFd(__a) || _ISINFd(__b)) { - __a = _COPYSIGNd(_ISINFd(__a) ? 1 : 0, __a); - __b = _COPYSIGNd(_ISINFd(__b) ? 1 : 0, __b); - if (_ISNANd(__c)) - __c = _COPYSIGNd(0, __c); - if (_ISNANd(__d)) - __d = _COPYSIGNd(0, __d); + if (__builtin_isinf(__a) || __builtin_isinf(__b)) { + __a = __builtin_copysign(__builtin_isinf(__a) ? 1 : 0, __a); + __b = __builtin_copysign(__builtin_isinf(__b) ? 
1 : 0, __b); + if (__builtin_isnan(__c)) + __c = __builtin_copysign(0, __c); + if (__builtin_isnan(__d)) + __d = __builtin_copysign(0, __d); __recalc = 1; } - if (_ISINFd(__c) || _ISINFd(__d)) { - __c = _COPYSIGNd(_ISINFd(__c) ? 1 : 0, __c); - __d = _COPYSIGNd(_ISINFd(__d) ? 1 : 0, __d); - if (_ISNANd(__a)) - __a = _COPYSIGNd(0, __a); - if (_ISNANd(__b)) - __b = _COPYSIGNd(0, __b); + if (__builtin_isinf(__c) || __builtin_isinf(__d)) { + __c = __builtin_copysign(__builtin_isinf(__c) ? 1 : 0, __c); + __d = __builtin_copysign(__builtin_isinf(__d) ? 1 : 0, __d); + if (__builtin_isnan(__a)) + __a = __builtin_copysign(0, __a); + if (__builtin_isnan(__b)) + __b = __builtin_copysign(0, __b); __recalc = 1; } - if (!__recalc && - (_ISINFd(__ac) || _ISINFd(__bd) || _ISINFd(__ad) || _ISINFd(__bc))) { - if (_ISNANd(__a)) - __a = _COPYSIGNd(0, __a); - if (_ISNANd(__b)) - __b = _COPYSIGNd(0, __b); - if (_ISNANd(__c)) - __c = _COPYSIGNd(0, __c); - if (_ISNANd(__d)) - __d = _COPYSIGNd(0, __d); + if (!__recalc && (__builtin_isinf(__ac) || __builtin_isinf(__bd) || + __builtin_isinf(__ad) || __builtin_isinf(__bc))) { + if (__builtin_isnan(__a)) + __a = __builtin_copysign(0, __a); + if (__builtin_isnan(__b)) + __b = __builtin_copysign(0, __b); + if (__builtin_isnan(__c)) + __c = __builtin_copysign(0, __c); + if (__builtin_isnan(__d)) + __d = __builtin_copysign(0, __d); __recalc = 1; } if (__recalc) { @@ -145,36 +99,36 @@ __DEVICE__ float _Complex __mulsc3(float __a, float __b, float __c, float __d) { float _Complex z; __real__(z) = __ac - __bd; __imag__(z) = __ad + __bc; - if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) { + if (__builtin_isnan(__real__(z)) && __builtin_isnan(__imag__(z))) { int __recalc = 0; - if (_ISINFf(__a) || _ISINFf(__b)) { - __a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a); - __b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b); - if (_ISNANf(__c)) - __c = _COPYSIGNf(0, __c); - if (_ISNANf(__d)) - __d = _COPYSIGNf(0, __d); + if (__builtin_isinf(__a) || __builtin_isinf(__b)) { + __a = __builtin_copysignf(__builtin_isinf(__a) ? 1 : 0, __a); + __b = __builtin_copysignf(__builtin_isinf(__b) ? 1 : 0, __b); + if (__builtin_isnan(__c)) + __c = __builtin_copysignf(0, __c); + if (__builtin_isnan(__d)) + __d = __builtin_copysignf(0, __d); __recalc = 1; } - if (_ISINFf(__c) || _ISINFf(__d)) { - __c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c); - __d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d); - if (_ISNANf(__a)) - __a = _COPYSIGNf(0, __a); - if (_ISNANf(__b)) - __b = _COPYSIGNf(0, __b); + if (__builtin_isinf(__c) || __builtin_isinf(__d)) { + __c = __builtin_copysignf(__builtin_isinf(__c) ? 1 : 0, __c); + __d = __builtin_copysignf(__builtin_isinf(__d) ? 
1 : 0, __d); + if (__builtin_isnan(__a)) + __a = __builtin_copysignf(0, __a); + if (__builtin_isnan(__b)) + __b = __builtin_copysignf(0, __b); __recalc = 1; } - if (!__recalc && - (_ISINFf(__ac) || _ISINFf(__bd) || _ISINFf(__ad) || _ISINFf(__bc))) { - if (_ISNANf(__a)) - __a = _COPYSIGNf(0, __a); - if (_ISNANf(__b)) - __b = _COPYSIGNf(0, __b); - if (_ISNANf(__c)) - __c = _COPYSIGNf(0, __c); - if (_ISNANf(__d)) - __d = _COPYSIGNf(0, __d); + if (!__recalc && (__builtin_isinf(__ac) || __builtin_isinf(__bd) || + __builtin_isinf(__ad) || __builtin_isinf(__bc))) { + if (__builtin_isnan(__a)) + __a = __builtin_copysignf(0, __a); + if (__builtin_isnan(__b)) + __b = __builtin_copysignf(0, __b); + if (__builtin_isnan(__c)) + __c = __builtin_copysignf(0, __c); + if (__builtin_isnan(__d)) + __d = __builtin_copysignf(0, __d); __recalc = 1; } if (__recalc) { @@ -191,8 +145,9 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c, // Can't use std::max, because that's defined in <algorithm>, and we don't // want to pull that in for every compile. The CUDA headers define // ::max(float, float) and ::max(double, double), which is sufficient for us. - double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d))); - if (_ISFINITEd(__logbw)) { + double __logbw = + _LOGBd(__builtin_fmax(__builtin_fabs(__c), __builtin_fabs(__d))); + if (__builtin_isfinite(__logbw)) { __ilogbw = (int)__logbw; __c = _SCALBNd(__c, -__ilogbw); __d = _SCALBNd(__d, -__ilogbw); @@ -201,20 +156,20 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c, double _Complex z; __real__(z) = _SCALBNd((__a * __c + __b * __d) / __denom, -__ilogbw); __imag__(z) = _SCALBNd((__b * __c - __a * __d) / __denom, -__ilogbw); - if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) { - if ((__denom == 0.0) && (!_ISNANd(__a) || !_ISNANd(__b))) { - __real__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __a; - __imag__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __b; - } else if ((_ISINFd(__a) || _ISINFd(__b)) && _ISFINITEd(__c) && - _ISFINITEd(__d)) { - __a = _COPYSIGNd(_ISINFd(__a) ? 1.0 : 0.0, __a); - __b = _COPYSIGNd(_ISINFd(__b) ? 1.0 : 0.0, __b); + if (__builtin_isnan(__real__(z)) && __builtin_isnan(__imag__(z))) { + if ((__denom == 0.0) && (!__builtin_isnan(__a) || !__builtin_isnan(__b))) { + __real__(z) = __builtin_copysign(__builtin_huge_val(), __c) * __a; + __imag__(z) = __builtin_copysign(__builtin_huge_val(), __c) * __b; + } else if ((__builtin_isinf(__a) || __builtin_isinf(__b)) && + __builtin_isfinite(__c) && __builtin_isfinite(__d)) { + __a = __builtin_copysign(__builtin_isinf(__a) ? 1.0 : 0.0, __a); + __b = __builtin_copysign(__builtin_isinf(__b) ? 1.0 : 0.0, __b); __real__(z) = __builtin_huge_val() * (__a * __c + __b * __d); __imag__(z) = __builtin_huge_val() * (__b * __c - __a * __d); - } else if (_ISINFd(__logbw) && __logbw > 0.0 && _ISFINITEd(__a) && - _ISFINITEd(__b)) { - __c = _COPYSIGNd(_ISINFd(__c) ? 1.0 : 0.0, __c); - __d = _COPYSIGNd(_ISINFd(__d) ? 1.0 : 0.0, __d); + } else if (__builtin_isinf(__logbw) && __logbw > 0.0 && + __builtin_isfinite(__a) && __builtin_isfinite(__b)) { + __c = __builtin_copysign(__builtin_isinf(__c) ? 1.0 : 0.0, __c); + __d = __builtin_copysign(__builtin_isinf(__d) ? 
1.0 : 0.0, __d); __real__(z) = 0.0 * (__a * __c + __b * __d); __imag__(z) = 0.0 * (__b * __c - __a * __d); } @@ -224,8 +179,9 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c, __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) { int __ilogbw = 0; - float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d))); - if (_ISFINITEf(__logbw)) { + float __logbw = + _LOGBf(__builtin_fmaxf(__builtin_fabsf(__c), __builtin_fabsf(__d))); + if (__builtin_isfinite(__logbw)) { __ilogbw = (int)__logbw; __c = _SCALBNf(__c, -__ilogbw); __d = _SCALBNf(__d, -__ilogbw); @@ -234,20 +190,20 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) { float _Complex z; __real__(z) = _SCALBNf((__a * __c + __b * __d) / __denom, -__ilogbw); __imag__(z) = _SCALBNf((__b * __c - __a * __d) / __denom, -__ilogbw); - if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) { - if ((__denom == 0) && (!_ISNANf(__a) || !_ISNANf(__b))) { - __real__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __a; - __imag__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __b; - } else if ((_ISINFf(__a) || _ISINFf(__b)) && _ISFINITEf(__c) && - _ISFINITEf(__d)) { - __a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a); - __b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b); + if (__builtin_isnan(__real__(z)) && __builtin_isnan(__imag__(z))) { + if ((__denom == 0) && (!__builtin_isnan(__a) || !__builtin_isnan(__b))) { + __real__(z) = __builtin_copysignf(__builtin_huge_valf(), __c) * __a; + __imag__(z) = __builtin_copysignf(__builtin_huge_valf(), __c) * __b; + } else if ((__builtin_isinf(__a) || __builtin_isinf(__b)) && + __builtin_isfinite(__c) && __builtin_isfinite(__d)) { + __a = __builtin_copysignf(__builtin_isinf(__a) ? 1 : 0, __a); + __b = __builtin_copysignf(__builtin_isinf(__b) ? 1 : 0, __b); __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d); __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d); - } else if (_ISINFf(__logbw) && __logbw > 0 && _ISFINITEf(__a) && - _ISFINITEf(__b)) { - __c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c); - __d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d); + } else if (__builtin_isinf(__logbw) && __logbw > 0 && + __builtin_isfinite(__a) && __builtin_isfinite(__b)) { + __c = __builtin_copysignf(__builtin_isinf(__c) ? 1 : 0, __c); + __d = __builtin_copysignf(__builtin_isinf(__d) ? 
1 : 0, __d); __real__(z) = 0 * (__a * __c + __b * __d); __imag__(z) = 0 * (__b * __c - __a * __d); } @@ -259,22 +215,10 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) { } // extern "C" #endif -#undef _ISNANd -#undef _ISNANf -#undef _ISINFd -#undef _ISINFf -#undef _COPYSIGNd -#undef _COPYSIGNf -#undef _ISFINITEd -#undef _ISFINITEf #undef _SCALBNd #undef _SCALBNf -#undef _ABSd -#undef _ABSf #undef _LOGBd #undef _LOGBf -#undef _fmaxd -#undef _fmaxf #if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__) #pragma omp end declare target diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h index 8612372..0226fe9 100644 --- a/clang/lib/Headers/__clang_cuda_device_functions.h +++ b/clang/lib/Headers/__clang_cuda_device_functions.h @@ -528,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); } __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); } __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }; __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }; -__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); } +__DEVICE__ __attribute__((noreturn)) void __trap(void) { __builtin_trap(); } __DEVICE__ unsigned short __usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) { return __nvvm_atom_cas_gen_us(__p, __cmp, __v); diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h index fa8d918..fad9c6c 100644 --- a/clang/lib/Headers/__clang_hip_libdevice_declares.h +++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h @@ -338,6 +338,23 @@ __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16); +__device__ void __asan_poison_memory_region(const void *addr, + __SIZE_TYPE__ size); +__device__ void __asan_unpoison_memory_region(const void *addr, + __SIZE_TYPE__ size); +__device__ int __asan_address_is_poisoned(const void *addr); +__device__ void *__asan_region_is_poisoned(void *beg, __SIZE_TYPE__ size); + +#if __has_feature(address_sanitizer) +#define ASAN_POISON_MEMORY_REGION(addr, size) \ + __asan_poison_memory_region((addr), (size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + __asan_unpoison_memory_region((addr), (size)) +#else +#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 759e742..03c2721 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -498,7 +498,7 @@ __DEVICE__ float log2f(float __x) { return __FAST_OR_SLOW(__log2f, __builtin_log2f)(__x); } __DEVICE__ -float logbf(float __x) { return __ocml_logb_f32(__x); } +float logbf(float __x) { return __builtin_logbf(__x); } __DEVICE__ float logf(float __x) { return __FAST_OR_SLOW(__logf, __builtin_logf)(__x); } @@ -901,7 +901,7 @@ __DEVICE__ double log2(double __x) { return __ocml_log2_f64(__x); } __DEVICE__ -double logb(double __x) { return __ocml_logb_f64(__x); } +double logb(double __x) { return __builtin_logb(__x); } __DEVICE__ long int lrint(double __x) { return __builtin_rint(__x); } diff --git a/clang/lib/Headers/amo.h b/clang/lib/Headers/amo.h new file 
mode 100644 index 0000000..62ee0e6 --- /dev/null +++ b/clang/lib/Headers/amo.h @@ -0,0 +1,97 @@ +/*===---- amo.h - PowerPC Atomic Memory Operations ------------------------===*\ + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * +\*===----------------------------------------------------------------------===*/ + +/* This header provides compatibility for GCC's AMO functions. + * The functions here call Clang's underlying AMO builtins. + */ + +#ifndef _AMO_H +#define _AMO_H + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* AMO Load Operation Codes (FC values) */ +enum { + _AMO_LD_ADD = 0x00, /* Fetch and Add */ + _AMO_LD_XOR = 0x01, /* Fetch and XOR */ + _AMO_LD_IOR = 0x02, /* Fetch and OR */ + _AMO_LD_AND = 0x03, /* Fetch and AND */ + _AMO_LD_UMAX = 0x04, /* Fetch and Maximum Unsigned */ + _AMO_LD_SMAX = 0x05, /* Fetch and Maximum Signed */ + _AMO_LD_UMIN = 0x06, /* Fetch and Minimum Unsigned */ + _AMO_LD_SMIN = 0x07, /* Fetch and Minimum Signed */ + _AMO_LD_SWAP = 0x08 /* Swap */ +}; + +/* 32-bit unsigned AMO load operations */ +static inline uint32_t amo_lwat_add(uint32_t *ptr, uint32_t val) { + return __builtin_amo_lwat(ptr, val, _AMO_LD_ADD); +} + +static inline uint32_t amo_lwat_xor(uint32_t *ptr, uint32_t val) { + return __builtin_amo_lwat(ptr, val, _AMO_LD_XOR); +} + +static inline uint32_t amo_lwat_ior(uint32_t *ptr, uint32_t val) { + return __builtin_amo_lwat(ptr, val, _AMO_LD_IOR); +} + +static inline uint32_t amo_lwat_and(uint32_t *ptr, uint32_t val) { + return __builtin_amo_lwat(ptr, val, _AMO_LD_AND); +} + +static inline uint32_t amo_lwat_umax(uint32_t *ptr, uint32_t val) { + return __builtin_amo_lwat(ptr, val, _AMO_LD_UMAX); +} + +static inline uint32_t amo_lwat_umin(uint32_t *ptr, uint32_t val) { + return __builtin_amo_lwat(ptr, val, _AMO_LD_UMIN); +} + +static inline uint32_t amo_lwat_swap(uint32_t *ptr, uint32_t val) { + return __builtin_amo_lwat(ptr, val, _AMO_LD_SWAP); +} + +/* 64-bit unsigned AMO load operations */ +static inline uint64_t amo_ldat_add(uint64_t *ptr, uint64_t val) { + return __builtin_amo_ldat(ptr, val, _AMO_LD_ADD); +} + +static inline uint64_t amo_ldat_xor(uint64_t *ptr, uint64_t val) { + return __builtin_amo_ldat(ptr, val, _AMO_LD_XOR); +} + +static inline uint64_t amo_ldat_ior(uint64_t *ptr, uint64_t val) { + return __builtin_amo_ldat(ptr, val, _AMO_LD_IOR); +} + +static inline uint64_t amo_ldat_and(uint64_t *ptr, uint64_t val) { + return __builtin_amo_ldat(ptr, val, _AMO_LD_AND); +} + +static inline uint64_t amo_ldat_umax(uint64_t *ptr, uint64_t val) { + return __builtin_amo_ldat(ptr, val, _AMO_LD_UMAX); +} + +static inline uint64_t amo_ldat_umin(uint64_t *ptr, uint64_t val) { + return __builtin_amo_ldat(ptr, val, _AMO_LD_UMIN); +} + +static inline uint64_t amo_ldat_swap(uint64_t *ptr, uint64_t val) { + return __builtin_amo_ldat(ptr, val, _AMO_LD_SWAP); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _AMO_H */ diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h index fcc2075..97f63e8 100644 --- a/clang/lib/Headers/arm_acle.h +++ b/clang/lib/Headers/arm_acle.h @@ -821,28 +821,6 @@ __arm_st64bv0(void *__addr, data512_t __value) { #endif // __ARM_FEATURE_COPROC -/* 17 Transactional Memory Extension (TME) Intrinsics */ -#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME - -#define _TMFAILURE_REASON 0x00007fffu -#define _TMFAILURE_RTRY 0x00008000u 
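A hedged usage sketch for the new amo.h above (not part of the diff): the amo_* wrappers map onto Clang's __builtin_amo_lwat/__builtin_amo_ldat with the _AMO_LD_* function codes from the header, and are assumed to require a POWER9-or-later target (e.g. -mcpu=power9); the counter here is purely illustrative.

#include <amo.h>
#include <stdint.h>

/* Atomically add 1 to a shared 32-bit counter and return the old value
   (fetch-and-add, FC = _AMO_LD_ADD). */
uint32_t bump(uint32_t *counter) {
  return amo_lwat_add(counter, 1);
}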
-#define _TMFAILURE_CNCL 0x00010000u -#define _TMFAILURE_MEM 0x00020000u -#define _TMFAILURE_IMP 0x00040000u -#define _TMFAILURE_ERR 0x00080000u -#define _TMFAILURE_SIZE 0x00100000u -#define _TMFAILURE_NEST 0x00200000u -#define _TMFAILURE_DBG 0x00400000u -#define _TMFAILURE_INT 0x00800000u -#define _TMFAILURE_TRIVIAL 0x01000000u - -#define __tstart() __builtin_arm_tstart() -#define __tcommit() __builtin_arm_tcommit() -#define __tcancel(__arg) __builtin_arm_tcancel(__arg) -#define __ttest() __builtin_arm_ttest() - -#endif /* __ARM_FEATURE_TME */ - /* 8.7 Armv8.5-A Random number generation intrinsics */ #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand"))) diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h index 37ebc4f..3e9f274 100644 --- a/clang/lib/Headers/avx10_2_512bf16intrin.h +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1))); __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \ __min_vector_width__(512))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#endif + static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) { return __builtin_bit_cast(__m512bh, _mm512_setzero_ps()); } @@ -167,13 +173,13 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { (__v32bf)__A); } -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutexvar_pbh(__m512i __A, __m512bh __B) { return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } @@ -423,7 +429,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) { (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U))) static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) { - return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A); + return __builtin_elementwise_sqrt(__A); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 @@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh( (__v32bf)_mm512_setzero_pbh()); } +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR #undef __DEFAULT_FN_ATTRS512 #endif diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index fdb57c7..b2215b7 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -185,8 +185,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -206,8 +206,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return 
(__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32( @@ -227,8 +227,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -248,8 +248,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32( @@ -269,8 +269,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -290,8 +290,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32( diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 765cd68..179ec53 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1))); __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \ __min_vector_width__(128))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) { return __builtin_bit_cast(__m256bh, _mm256_setzero_ps()); } @@ -213,12 +221,12 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_move_sbh(__m128bh __a, return __a; } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), __W); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), _mm_setzero_pbh()); @@ -287,24 +295,24 @@ 
_mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { (__v16bf)__A); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi)__B); } -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutexvar_pbh(__m128i __A, __m128bh __B) { return (__m128bh)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); } -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutexvar_pbh(__m256i __A, __m256bh __B) { return (__m256bh)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } @@ -818,7 +826,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) { (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U))) static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) { - return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A); + return __builtin_elementwise_sqrt(__A); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 @@ -835,7 +843,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) { } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) { - return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A); + return __builtin_elementwise_sqrt(__A); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 @@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 - +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif #endif diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 3cbaaec..d3ceb23 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -2095,9 +2095,8 @@ _mm256_slli_epi16(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi16(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sll_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); } @@ -2134,9 +2133,8 @@ _mm256_slli_epi32(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi32(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sll_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); } @@ -2173,9 +2171,8 @@ _mm256_slli_epi64(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [4 x i64] containing the result. 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi64(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sll_epi64(__m256i __a, __m128i __count) { return __builtin_ia32_psllq256((__v4di)__a, __count); } @@ -2214,9 +2211,8 @@ _mm256_srai_epi16(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi16(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sra_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); } @@ -2255,9 +2251,8 @@ _mm256_srai_epi32(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi32(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sra_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); } @@ -2336,9 +2331,8 @@ _mm256_srli_epi16(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi16(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srl_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); } @@ -2375,9 +2369,8 @@ _mm256_srli_epi32(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi32(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srl_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); } @@ -2414,9 +2407,8 @@ _mm256_srli_epi64(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi64(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srl_epi64(__m256i __a, __m128i __count) { return __builtin_ia32_psrlq256((__v4di)__a, __count); } @@ -3214,9 +3206,8 @@ _mm_broadcastq_epi64(__m128i __X) { /// A 256-bit vector of [8 x i32] containing indexes of values to use from /// \a __a. /// \returns A 256-bit vector of [8 x i32] containing the result. 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); } @@ -3272,9 +3263,8 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) /// A 256-bit vector of [8 x i32] containing indexes of values to use from /// \a __a. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) { return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); } diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h index 3973f0e..458d1f8 100644 --- a/clang/lib/Headers/avx512bf16intrin.h +++ b/clang/lib/Headers/avx512bf16intrin.h @@ -25,6 +25,14 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead"))); #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + /// Convert One BF16 Data to One Single Float Data. /// /// \headerfile <x86intrin.h> @@ -35,8 +43,8 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead"))); /// A bfloat data. /// \returns A float data whose sign field and exponent field keep unchanged, /// and fraction field is extended to 23 bits. -static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) { - return __builtin_ia32_cvtsbf162ss_32(__A); +static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsbh_ss(__bf16 __A) { + return (float)(__A); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -235,9 +243,9 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { /// \param __A /// A 256-bit vector of [16 x bfloat]. /// \returns A 512-bit vector of [16 x float] come from conversion of __A -static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( - (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtpbh_ps(__m256bh __A) { + return (__m512) __builtin_convertvector(__A, __v16sf); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -250,10 +258,11 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { /// \param __A /// A 256-bit vector of [16 x bfloat]. /// \returns A 512-bit vector of [16 x float] come from conversion of __A -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( - (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtpbh_ps(__A), + (__v16sf)_mm512_setzero_ps()); } /// Convert Packed BF16 Data to Packed float Data using merging mask. 
@@ -268,15 +277,16 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { /// \param __A /// A 256-bit vector of [16 x bfloat]. /// \returns A 512-bit vector of [16 x float] come from conversion of __A -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( - (__m512i)__S, (__mmask16)__U, - (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); + return (__m512)__builtin_ia32_selectps_512( + (__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S); } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #undef __DEFAULT_FN_ATTRS512 +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR #endif #endif diff --git a/clang/lib/Headers/avx512bitalgintrin.h b/clang/lib/Headers/avx512bitalgintrin.h index 98197e4..f5e9b1a 100644 --- a/clang/lib/Headers/avx512bitalgintrin.h +++ b/clang/lib/Headers/avx512bitalgintrin.h @@ -15,44 +15,42 @@ #define __AVX512BITALGINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), \ - __min_vector_width__(512))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr + __min_vector_width__(512))) constexpr #else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), \ + __min_vector_width__(512))) #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_popcnt_epi16(__m512i __A) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi16(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v32hu)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512( (__mmask32)__U, (__v32hi)_mm512_popcnt_epi16(__B), (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) { return _mm512_mask_popcnt_epi16((__m512i)_mm512_setzero_si512(), __U, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_popcnt_epi8(__m512i __A) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi8(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v64qu)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512( (__mmask64)__U, (__v64qi)_mm512_popcnt_epi8(__B), (__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) { return _mm512_mask_popcnt_epi8((__m512i)_mm512_setzero_si512(), __U, __B); } @@ -74,6 +72,4 @@ _mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) } #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_CONSTEXPR - #endif diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index ac75b6c..48b7c98 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -92,69 +92,65 @@ 
_kxor_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) { return (unsigned char)__builtin_ia32_kortestcsi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) { return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) { *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B); return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) { return (unsigned char)__builtin_ia32_kortestcdi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) { return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B); return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) { return (unsigned char)__builtin_ia32_ktestcsi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) { return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) { *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B); return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) { return (unsigned char)__builtin_ia32_ktestcdi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) { return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B); return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); @@ -182,22 +178,22 @@ _kadd_mask64(__mmask64 __A, __mmask64 __B) { #define _kshiftri_mask64(A, I) \ ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))) -static __inline__ unsigned int 
__DEFAULT_FN_ATTRS -_cvtmask32_u32(__mmask32 __A) { +static __inline__ unsigned int + __DEFAULT_FN_ATTRS_CONSTEXPR _cvtmask32_u32(__mmask32 __A) { return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A); } -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR _cvtmask64_u64(__mmask64 __A) { return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR _cvtu32_mask32(unsigned int __A) { return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS +static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR _cvtu64_mask64(unsigned long long __A) { return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A); } @@ -515,7 +511,7 @@ _mm512_packs_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -523,9 +519,8 @@ _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, (__v32hi)_mm512_packs_epi32(__A, __B), (__v32hi)__W); @@ -536,7 +531,7 @@ _mm512_packs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -544,7 +539,7 @@ _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -557,7 +552,7 @@ _mm512_packus_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -565,7 +560,7 @@ _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -578,7 +573,7 @@ _mm512_packus_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return 
(__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -586,7 +581,7 @@ _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -599,17 +594,15 @@ _mm512_adds_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_adds_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_adds_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); @@ -620,7 +613,7 @@ _mm512_adds_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -628,7 +621,7 @@ _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -641,7 +634,7 @@ _mm512_adds_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -649,7 +642,7 @@ _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -662,17 +655,15 @@ _mm512_adds_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_adds_epu16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i 
__B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_adds_epu16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -890,7 +881,7 @@ _mm512_subs_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -898,7 +889,7 @@ _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -911,7 +902,7 @@ _mm512_subs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -919,7 +910,7 @@ _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -932,7 +923,7 @@ _mm512_subs_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -940,7 +931,7 @@ _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -953,7 +944,7 @@ _mm512_subs_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -961,7 +952,7 @@ _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -969,35 +960,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } 
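Many of the _mm512_mask_*/_mm512_maskz_* wrappers in this hunk share one pattern: compute the full-width result, then route it through __builtin_ia32_selectw_512 (or selectb_512), which takes each lane from the result when its mask bit is set and from the passthrough (or zero, for the maskz variants) otherwise. A scalar C model of that per-lane select, purely illustrative:

/* Illustrative only: per-lane behavior of the select builtins used above.
   Lane i takes val[i] when mask bit i is set, else the passthrough src[i];
   maskz variants pass a zeroed src. */
static inline void mask_select16(unsigned mask, const short *val,
                                 const short *src, short *out, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = ((mask >> i) & 1) ? val[i] : src[i];
}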
-static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)_mm512_setzero_si512()); @@ -1199,14 +1186,14 @@ _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { 62, 64+62, 63, 64+63); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpackhi_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpackhi_epi8(__A, __B), @@ -1226,14 +1213,14 @@ _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { 30, 32+30, 31, 32+31); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpackhi_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpackhi_epi16(__A, __B), @@ -1261,14 +1248,14 @@ _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { 54, 64+54, 55, 64+55); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpacklo_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpacklo_epi8(__A, __B), @@ -1288,14 +1275,14 @@ _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { 26, 32+26, 27, 32+27); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpacklo_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i 
__DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpacklo_epi16(__A, __B), @@ -1396,23 +1383,20 @@ _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi16(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sll_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sll_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sll_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -1486,23 +1470,20 @@ _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi16(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sra_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sra_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sra_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -1528,23 +1509,20 @@ _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi16(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srl_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srl_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return 
(__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srl_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -1574,7 +1552,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a), \ (int)(imm))) -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, @@ -1582,23 +1560,21 @@ _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) (__v32hi) __W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, (__v32hi) __A, (__v32hi) _mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, (__v64qi) __A, (__v64qi) __W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, @@ -1606,7 +1582,7 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) (__v64qi) _mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) { return (__m512i) __builtin_ia32_selectb_512(__M, @@ -1614,23 +1590,21 @@ _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) (__v64qi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi8 (__mmask64 __M, char __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_set1_epi8(__mmask64 __M, char __A) { return (__m512i) __builtin_ia32_selectb_512(__M, (__v64qi) _mm512_set1_epi8(__A), (__v64qi) _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd(__mmask64 __A, - __mmask64 __B) { +static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kunpackd(__mmask64 __A, __mmask64 __B) { return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, (__mmask64) __B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_kunpackw (__mmask32 __A, __mmask32 __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kunpackw(__mmask32 __A, __mmask32 __B) { return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, (__mmask32) __B); } @@ -1776,15 +1750,13 @@ _mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 -_mm512_movepi8_mask (__m512i __A) -{ +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_movepi8_mask(__m512i __A) { return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 -_mm512_movepi16_mask (__m512i __A) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_movepi16_mask(__m512i __A) { return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A); } @@ -1809,7 +1781,7 @@ _mm512_broadcastb_epi8(__m128i __A) { 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectb_512(__M, @@ -1817,15 +1789,14 @@ _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) (__v64qi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectb_512(__M, (__v64qi) _mm512_broadcastb_epi8(__A), (__v64qi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) { return (__m512i) __builtin_ia32_selectw_512(__M, @@ -1833,9 +1804,8 @@ _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) (__v32hi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi16 (__mmask32 __M, short __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_set1_epi16(__mmask32 __M, short __A) { return (__m512i) __builtin_ia32_selectw_512(__M, (__v32hi) _mm512_set1_epi16(__A), (__v32hi) _mm512_setzero_si512()); @@ -1848,7 +1818,7 @@ _mm512_broadcastw_epi16(__m128i __A) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectw_512(__M, @@ -1856,7 +1826,7 @@ _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) (__v32hi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectw_512(__M, @@ -1864,25 +1834,21 @@ _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) (__v32hi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi16 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, (__v32hi)_mm512_permutexvar_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, (__v32hi)_mm512_permutexvar_epi16(__A, __B), (__v32hi)__W); diff --git a/clang/lib/Headers/avx512cdintrin.h b/clang/lib/Headers/avx512cdintrin.h index fb6dcb6..f9de207 100644 --- a/clang/lib/Headers/avx512cdintrin.h +++ b/clang/lib/Headers/avx512cdintrin.h @@ -17,8 +17,8 @@ /* Define the default attributes for the 
functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512cd"), __min_vector_width__(512))) + __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), \ + __min_vector_width__(512))) constexpr #else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), \ diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index fef1a2d..ae02cdd 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -59,55 +59,49 @@ _kxor_mask8(__mmask8 __A, __mmask8 __B) { return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) { return (unsigned char)__builtin_ia32_kortestcqi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) { return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B); return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) { return (unsigned char)__builtin_ia32_ktestcqi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) { return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B); return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) { return (unsigned char)__builtin_ia32_ktestchi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) { return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B); return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); @@ -129,12 +123,12 @@ _kadd_mask16(__mmask16 __A, __mmask16 __B) { #define _kshiftri_mask8(A, I) \ ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))) -static __inline__ unsigned int __DEFAULT_FN_ATTRS 
-_cvtmask8_u32(__mmask8 __A) { +static __inline__ unsigned int + __DEFAULT_FN_ATTRS_CONSTEXPR _cvtmask8_u32(__mmask8 __A) { return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS_CONSTEXPR _cvtu32_mask8(unsigned int __A) { return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A); } @@ -1052,9 +1046,8 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(C), (int)(R))) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 -_mm512_movepi32_mask (__m512i __A) -{ +static __inline__ __mmask16 + __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_movepi32_mask(__m512i __A) { return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A); } @@ -1070,9 +1063,8 @@ _mm512_movm_epi64 (__mmask8 __A) return (__m512i) __builtin_ia32_cvtmask2q512 (__A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 -_mm512_movepi64_mask (__m512i __A) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_movepi64_mask(__m512i __A) { return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); } diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 18c4a44..02282cb 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -207,9 +207,7 @@ _mm512_undefined(void) return (__m512)__builtin_ia32_undef512(); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_undefined_ps(void) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_undefined_ps(void) { return (__m512)__builtin_ia32_undef512(); } @@ -1369,17 +1367,15 @@ _mm512_mul_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epi32(__X, __Y), (__v8di)__W); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epi32(__X, __Y), (__v8di)_mm512_setzero_si512 ()); @@ -1390,17 +1386,15 @@ _mm512_mul_epu32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epu32(__X, __Y), (__v8di)__W); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_mul_epu32(__X, __Y), (__v8di)_mm512_setzero_si512 ()); @@ -1450,26 +1444,19 @@ _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { (__v8df)_mm512_sqrt_round_pd((A), (R)), \ (__v8df)_mm512_setzero_pd())) -static 
__inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_sqrt_pd(__m512d __A) -{ - return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A, - _MM_FROUND_CUR_DIRECTION); +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_sqrt_pd(__m512d __A) { + return (__m512d)__builtin_elementwise_sqrt((__v8df)__A); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_sqrt_pd(__A), +_mm512_mask_sqrt_pd(__m512d __W, __mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_sqrt_pd(__A), (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_sqrt_pd(__A), +_mm512_maskz_sqrt_pd(__mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_sqrt_pd(__A), (__v8df)_mm512_setzero_pd()); } @@ -1486,26 +1473,19 @@ _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_sqrt_ps(__m512 __A) -{ - return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A, - _MM_FROUND_CUR_DIRECTION); +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_sqrt_ps(__m512 __A) { + return (__m512)__builtin_elementwise_sqrt((__v16sf)__A); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_sqrt_ps(__A), +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_sqrt_ps(__A), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_sqrt_ps(__A), +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_sqrt_ps(__A), (__v16sf)_mm512_setzero_ps()); } @@ -1834,14 +1814,14 @@ _mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A) { (__v16si)_mm512_setzero_si512()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_add_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_add_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -1864,14 +1844,14 @@ _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_add_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_add_sd(__mmask8 __U,__m128d __A, 
__m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_add_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -1893,28 +1873,28 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_add_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_add_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_add_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_add_ps(__A, __B), @@ -1949,14 +1929,14 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_sub_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_sub_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -1978,14 +1958,14 @@ _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_sub_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_sub_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -2008,28 +1988,28 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_sub_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d 
__DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_sub_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_sub_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_sub_ps(__A, __B), @@ -2064,14 +2044,14 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_mul_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_mul_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -2093,14 +2073,14 @@ _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_mul_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_mul_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -2123,28 +2103,28 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_mul_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_mul_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_mul_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_mul_ps(__mmask16 __U, 
__m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_mul_ps(__A, __B), @@ -2179,14 +2159,14 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_div_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { __A = _mm_div_ss(__A, __B); return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } @@ -2209,14 +2189,14 @@ _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_div_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { __A = _mm_div_sd(__A, __B); return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } @@ -2244,14 +2224,14 @@ static __inline __m512d return (__m512d)((__v8df)__a/(__v8df)__b); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_div_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_div_pd(__A, __B), @@ -2263,14 +2243,14 @@ _mm512_div_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a/(__v16sf)__b); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_div_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_div_ps(__A, __B), @@ -3059,69 +3039,61 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) /* Vector permutations */ -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, (__v16si) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i 
__DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, (__v8di) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)_mm512_setzero_si512()); @@ -3515,44 +3487,38 @@ _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U, __m512i __A) { (__v8sf)_mm256_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_cvtpd_ps (__m512d __A) -{ - return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, - (__v8sf) _mm256_undefined_ps (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); +static __inline__ __m256 + __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_cvtpd_ps(__m512d __A) { + return (__m256)__builtin_ia32_cvtpd2ps512_mask( + (__v8df)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtpd_ps(__m256 __W, __mmask8 __U, __m512d __A) { return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, (__v8sf) __W, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_cvtpd_ps(__mmask8 __U, __m512d __A) { return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, (__v8sf) _mm256_setzero_ps (), (__mmask8) 
__U, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_cvtpd_pslo (__m512d __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_cvtpd_pslo(__m512d __A) { return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), (__v8sf) _mm256_setzero_ps (), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_cvtpd_pslo(__m512 __W, __mmask8 __U, __m512d __A) { return (__m512) __builtin_shufflevector ( (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), __U, __A), @@ -5408,45 +5374,39 @@ _mm512_kmov (__mmask16 __A) ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi32(__m512i __A, __m128i __B) -{ +static __inline__ __m512i + __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_sll_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_sll_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_sll_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi64(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sll_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_sll_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_sll_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); @@ -5493,45 +5453,39 @@ _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi32(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sra_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { 
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_sra_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_sra_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi64(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sra_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_sra_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_sra_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); @@ -5578,45 +5532,39 @@ _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi32(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srl_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srl_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srl_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi64(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srl_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_srl_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return 
(__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_srl_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); @@ -5905,115 +5853,104 @@ _mm_cvttss_u64 (__m128 __A) (__v16sf)_mm512_permute_ps((X), (C)), \ (__v16sf)_mm512_setzero_ps())) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutevar_pd(__m512d __A, __m512i __C) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutevar_pd(__m512d __A, __m512i __C) { return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_permutevar_pd(__A, __C), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_permutevar_pd(__A, __C), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutevar_ps(__m512 __A, __m512i __C) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutevar_ps(__m512 __A, __m512i __C) { return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_permutevar_ps(__A, __C), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_permutevar_ps(__A, __C), (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, (__v8df)__B); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)__A); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, - __m512d __B) -{ + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)(__m512d)__I); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d 
__A, __m512i __I, - __m512d __B) -{ + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, (__v16sf) __B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)__A); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)(__m512)__I); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)_mm512_setzero_ps()); } - #define _mm512_cvtt_roundpd_epu32(A, R) \ ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_undefined_si256(), \ @@ -7972,93 +7909,82 @@ _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) (__v8di)_mm512_permutex_epi64((X), (C)), \ (__v8di)_mm512_setzero_si512())) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_pd (__m512i __X, __m512d __Y) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_pd(__m512i __X, __m512d __Y) { return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, + __m512d __Y) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_permutexvar_pd(__X, __Y), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_permutexvar_pd(__X, __Y), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_epi64(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline__ __m512i 
__DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_permutexvar_epi64(__X, __Y), (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_permutexvar_epi64(__X, __Y), (__v8di)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_ps (__m512i __X, __m512 __Y) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_ps(__m512i __X, __m512 __Y) { return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_permutexvar_ps(__X, __Y), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_permutexvar_ps(__X, __Y), (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X); } #define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_permutexvar_epi32(__X, __Y), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, - __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_permutexvar_epi32(__X, __Y), (__v16si)__W); @@ -8081,39 +8007,34 @@ _mm512_kor(__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ int __DEFAULT_FN_ATTRS -_mm512_kortestc (__mmask16 __A, __mmask16 __B) -{ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kortestc(__mmask16 __A, __mmask16 __B) { return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ int __DEFAULT_FN_ATTRS -_mm512_kortestz (__mmask16 __A, __mmask16 __B) -{ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kortestz(__mmask16 __A, __mmask16 __B) { return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ 
unsigned char __DEFAULT_FN_ATTRS -_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) { return (unsigned char)__builtin_ia32_kortestchi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) -{ +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) { return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); } -static __inline__ unsigned char __DEFAULT_FN_ATTRS +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B); return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kunpackb (__mmask16 __A, __mmask16 __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kunpackb(__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } @@ -8140,12 +8061,12 @@ _mm512_kxor(__mmask16 __A, __mmask16 __B) { #define _kshiftri_mask16(A, I) \ ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I))) -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_cvtmask16_u32(__mmask16 __A) { +static __inline__ unsigned int + __DEFAULT_FN_ATTRS_CONSTEXPR _cvtmask16_u32(__mmask16 __A) { return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR _cvtu32_mask16(unsigned int __A) { return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); } @@ -8725,18 +8646,16 @@ _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_cvtsd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) { return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, (__v2df)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_cvtsd_ss(__mmask8 __U, __m128 __A, __m128d __B) { return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, (__v2df)__B, (__v4sf)_mm_setzero_ps(), diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 142cc07..4c6bf3a 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -588,23 +588,20 @@ _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_add_sh(__m128h __A, __m128h __B) { __A[0] += __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_add_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 
__U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_add_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -624,23 +621,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_sub_sh(__m128h __A, __m128h __B) { __A[0] -= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_sub_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_sub_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -660,23 +654,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mul_sh(__m128h __A, __m128h __B) { __A[0] *= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_mul_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_mul_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -696,23 +687,20 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ (__mmask8)(U), (int)(R))) -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, - __m128h __B) { +static __inline__ __m128h + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_div_sh(__m128h __A, __m128h __B) { __A[0] /= __B[0]; return __A; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_div_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { __A = _mm_div_sh(__A, __B); return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); } @@ -960,22 +948,19 @@ static __inline__ void 
__DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, } // moves with vmovsh: -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, - __m128h __b) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_move_sh(__m128h __a, __m128h __b) { __a[0] = __b[0]; return __a; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), _mm_setzero_ph()); } @@ -1401,24 +1386,20 @@ _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { (__v32hf)_mm512_setzero_ph())) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { - return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A, - _MM_FROUND_CUR_DIRECTION); + return (__m512h)__builtin_elementwise_sqrt((__v32hf)__A); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)(__U), - (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), - (__v32hf)(__m512h)(__W)); + (__mmask32)(__U), (__v32hf)_mm512_sqrt_ph(__A), (__v32hf)(__m512h)(__W)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)(__U), - (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), - (__v32hf)_mm512_setzero_ph()); + return (__m512h)__builtin_ia32_selectph_512((__mmask32)(__U), + (__v32hf)_mm512_sqrt_ph(__A), + (__v32hf)_mm512_setzero_ph()); } #define _mm_sqrt_round_sh(A, B, R) \ @@ -3316,13 +3297,13 @@ _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { (__v32hf)__A); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutexvar_ph(__m512i __A, __m512h __B) { return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h index 625a8ff..f73b607 100644 --- a/clang/lib/Headers/avx512ifmaintrin.h +++ b/clang/lib/Headers/avx512ifmaintrin.h @@ -17,9 +17,8 @@ /* Define the default attributes for the functions in this file. 
*/ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ - constexpr \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ - __min_vector_width__(512))) + __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ + __min_vector_width__(512))) constexpr #else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index b377c17..51d5210 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -18,13 +18,13 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ @@ -34,7 +34,6 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512ifma,avx512vl"), \ __min_vector_width__(256))) - #endif #if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__)) diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h index 964535c..5ac78f08 100644 --- a/clang/lib/Headers/avx512vbmiintrin.h +++ b/clang/lib/Headers/avx512vbmiintrin.h @@ -15,63 +15,57 @@ #define __VBMIINTRIN_H /* Define the default attributes for the functions in this file. 
*/ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \ + __min_vector_width__(512))) constexpr +#else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \ __min_vector_width__(512))) +#endif static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) -{ +_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutex2var_epi8( + __m512i __A, __mmask64 __U, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask2_permutex2var_epi8( + __m512i __A, __m512i __I, __mmask64 __U, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_permutex2var_epi8( + __mmask64 __U, __m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutexvar_epi8 (__m512i __A, __m512i __B) -{ +_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, - __m512i __B) -{ +_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutexvar_epi8( + __m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)__W); @@ -100,7 +94,5 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) (__v64qi)_mm512_setzero_si512()); } - #undef __DEFAULT_FN_ATTRS - #endif diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h index 4c50be7..40a67bd 100644 --- a/clang/lib/Headers/avx512vbmivlintrin.h +++ b/clang/lib/Headers/avx512vbmivlintrin.h @@ -15,6 +15,16 @@ #define __VBMIVLINTRIN_H /* Define the default attributes for the functions in this file. 
*/ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vbmi,avx512vl"), \ + __min_vector_width__(128))) constexpr +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vbmi,avx512vl"), \ + __min_vector_width__(256))) constexpr +#else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vbmi,avx512vl"), \ @@ -23,118 +33,96 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vbmi,avx512vl"), \ __min_vector_width__(256))) +#endif static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) -{ +_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, (__v16qi)__I, (__v16qi)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutex2var_epi8( + __m128i __A, __mmask16 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask2_permutex2var_epi8( + __m128i __A, __m128i __I, __mmask16 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutex2var_epi8( + __mmask16 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) -{ +_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I, (__v32qi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutex2var_epi8( + __m256i __A, __mmask32 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask2_permutex2var_epi8( + __m256i __A, __m256i __I, __mmask32 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutex2var_epi8( + __mmask32 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), 
(__v32qi)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutexvar_epi8 (__m128i __A, __m128i __B) -{ +_mm_permutexvar_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) -{ +_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutexvar_epi8( + __m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi8 (__m256i __A, __m256i __B) -{ +_mm256_permutexvar_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, - __m256i __B) -{ +_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutexvar_epi8( + __m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)__W); @@ -186,8 +174,6 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) (__v32qi)_mm256_setzero_si256()); } - #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 - #endif diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h index 2d7ea01..8543402 100644 --- a/clang/lib/Headers/avx512vlbf16intrin.h +++ b/clang/lib/Headers/avx512vlbf16intrin.h @@ -24,6 +24,14 @@ __target__("avx512vl,avx512bf16"), \ __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + /// Convert Two Packed Single Data to One Packed BF16 Data. /// /// \headerfile <x86intrin.h> @@ -421,9 +429,10 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { /// \param __A /// A 128-bit vector of [4 x bfloat]. /// \returns A 128-bit vector of [4 x float] come from conversion of __A -static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { - return _mm_castsi128_ps( - (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16)); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_cvtpbh_ps(__m128bh __A) { + return (__m128)_mm256_castps256_ps128( + (__m256) __builtin_convertvector(__A, __v8sf)); } /// Convert Packed BF16 Data to Packed float Data. 
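/* Editorial aside, not part of the diff: the avx512vlbf16intrin.h hunks
 * above swap the old widen-by-shift lowering (_mm_cvtepi16_epi32 followed
 * by a 16-bit left shift) for a single __builtin_convertvector, and tag
 * the conversions constexpr in C++11 and later via
 * __DEFAULT_FN_ATTRS128_CONSTEXPR, so they can fold in constant
 * evaluation. A minimal usage sketch under those assumptions, compiled
 * with avx512bf16 and avx512vl enabled; the function and variable names
 * are hypothetical. */
#include <immintrin.h>

static inline __m128 widen_low_bf16(__m128bh bh, __mmask8 keep) {
  /* Widen the low four bfloat16 lanes of bh to float; lanes whose bit in
   * keep is clear are zeroed. With keep == 0x0F this matches the plain
   * _mm_cvtpbh_ps(bh). */
  return _mm_maskz_cvtpbh_ps(keep, bh);
}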
@@ -433,9 +442,9 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { /// \param __A /// A 128-bit vector of [8 x bfloat]. /// \returns A 256-bit vector of [8 x float] come from conversion of __A -static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( - (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_cvtpbh_ps(__m128bh __A) { + return (__m256) __builtin_convertvector(__A, __v8sf); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -448,10 +457,10 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { /// \param __A /// A 128-bit vector of [4 x bfloat]. /// \returns A 128-bit vector of [4 x float] come from conversion of __A -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return _mm_castsi128_ps((__m128i)_mm_slli_epi32( - (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps()); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -464,10 +473,11 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { /// \param __A /// A 128-bit vector of [8 x bfloat]. /// \returns A 256-bit vector of [8 x float] come from conversion of __A -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( - (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtpbh_ps(__A), + (__v8sf)_mm256_setzero_ps()); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -483,11 +493,10 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { /// \param __A /// A 128-bit vector of [4 x bfloat]. /// \returns A 128-bit vector of [4 x float] come from conversion of __A -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { - return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32( - (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A), - 16)); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -503,15 +512,16 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { /// \param __A /// A 128-bit vector of [8 x bfloat]. 
/// \returns A 256-bit vector of [8 x float] come from conversion of __A -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( - (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), - 16)); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S); } #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif #endif diff --git a/clang/lib/Headers/avx512vlbitalgintrin.h b/clang/lib/Headers/avx512vlbitalgintrin.h index 1874adc..edfb9c1 100644 --- a/clang/lib/Headers/avx512vlbitalgintrin.h +++ b/clang/lib/Headers/avx512vlbitalgintrin.h @@ -15,6 +15,16 @@ #define __AVX512VLBITALGINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl,avx512bitalg"), \ + __min_vector_width__(128))) constexpr +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl,avx512bitalg"), \ + __min_vector_width__(256))) constexpr +#else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vl,avx512bitalg"), \ @@ -23,75 +33,66 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vl,avx512bitalg"), \ __min_vector_width__(256))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr -#else -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 #endif -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi16(__m256i __A) { return (__m256i)__builtin_elementwise_popcount((__v16hu)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256( (__mmask16)__U, (__v16hi)_mm256_popcnt_epi16(__B), (__v16hi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) { return _mm256_mask_popcnt_epi16((__m256i)_mm256_setzero_si256(), __U, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_popcnt_epi16(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi16(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v8hu)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128( (__mmask8)__U, (__v8hi)_mm_popcnt_epi16(__B), (__v8hi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) { return _mm_mask_popcnt_epi16((__m128i)_mm_setzero_si128(), __U, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 _mm256_popcnt_epi8(__m256i __A) { return (__m256i)__builtin_elementwise_popcount((__v32qu)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256( (__mmask32)__U, (__v32qi)_mm256_popcnt_epi8(__B), (__v32qi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) { return _mm256_mask_popcnt_epi8((__m256i)_mm256_setzero_si256(), __U, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_popcnt_epi8(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi8(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v16qu)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128( (__mmask16)__U, (__v16qi)_mm_popcnt_epi8(__B), (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) { return _mm_mask_popcnt_epi8((__m128i)_mm_setzero_si128(), __U, __B); } @@ -131,7 +132,4 @@ _mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 -#undef __DEFAULT_FN_ATTRS128_CONSTEXPR -#undef __DEFAULT_FN_ATTRS256_CONSTEXPR - #endif diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 263a107..a7c1e1c 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -536,14 +536,14 @@ _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_packs_epi32(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -551,7 +551,7 @@ _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -559,7 +559,7 @@ _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -567,7 +567,7 @@ _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -575,7 +575,7 @@ 
_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -583,7 +583,7 @@ _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -591,7 +591,7 @@ _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -599,7 +599,7 @@ _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -607,7 +607,7 @@ _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -615,7 +615,7 @@ _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -623,7 +623,7 @@ _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -631,7 +631,7 @@ _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -639,7 +639,7 @@ _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -647,7 +647,7 @@ _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -655,7 +655,7 
@@ _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -663,7 +663,7 @@ _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -671,7 +671,7 @@ _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -679,7 +679,7 @@ _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -687,7 +687,7 @@ _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -695,7 +695,7 @@ _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -703,7 +703,7 @@ _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -711,7 +711,7 @@ _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -719,7 +719,7 @@ _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -727,7 +727,7 @@ _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -735,7 +735,7 @@ 
_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -743,7 +743,7 @@ _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -751,7 +751,7 @@ _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -759,7 +759,7 @@ _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -767,7 +767,7 @@ _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -775,7 +775,7 @@ _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -783,7 +783,7 @@ _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1095,7 +1095,7 @@ _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1103,7 +1103,7 @@ _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1111,7 +1111,7 @@ _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1119,7 +1119,7 @@ _mm256_mask_subs_epi8(__m256i 
__W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1127,7 +1127,7 @@ _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1135,7 +1135,7 @@ _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1143,7 +1143,7 @@ _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1151,7 +1151,7 @@ _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1159,7 +1159,7 @@ _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1167,7 +1167,7 @@ _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1175,7 +1175,7 @@ _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1183,7 +1183,7 @@ _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1191,7 +1191,7 @@ _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1199,7 +1199,7 @@ _mm_mask_subs_epu16(__m128i __W, __mmask8 
__U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1207,7 +1207,7 @@ _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1215,7 +1215,7 @@ _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1223,69 +1223,61 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi) __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)_mm256_setzero_si256()); @@ -1440,14 +1432,14 @@ _mm_cvtepi16_epi8(__m128i __A) { 12, 13, 14, 15); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128(), @@ -1596,112 +1588,112 @@ _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { (__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpackhi_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpackhi_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpackhi_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpackhi_epi16(__A, __B), (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpackhi_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpackhi_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, 
(__v16qi)_mm_unpacklo_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpacklo_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpacklo_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpacklo_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpacklo_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpacklo_epi16(__A, __B), (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpacklo_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpacklo_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1709,7 +1701,7 @@ _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1717,7 +1709,7 @@ _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1725,7 +1717,7 @@ _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1734,7 +1726,7 @@ _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) } -static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1742,7 +1734,7 @@ _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1750,7 +1742,7 @@ _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1758,7 +1750,7 @@ _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1885,7 +1877,7 @@ _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1893,7 +1885,7 @@ _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2181,7 +2173,7 @@ _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) { (__v32qi) _mm256_setzero_si256 ()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) { return (__m128i) __builtin_ia32_selectb_128(__M, @@ -2189,7 +2181,7 @@ _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) (__v16qi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_set1_epi8 (__mmask16 __M, char __A) { return (__m128i) __builtin_ia32_selectb_128(__M, @@ -2197,7 +2189,7 @@ _mm_maskz_set1_epi8 (__mmask16 __M, char __A) (__v16qi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) { return (__m256i) __builtin_ia32_selectb_256(__M, @@ -2205,7 +2197,7 @@ _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) (__v32qi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) { return (__m256i) __builtin_ia32_selectb_256(__M, @@ -2488,27 +2480,23 @@ _mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) _mm256_setzero_si256()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_movepi8_mask (__m128i __A) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_movepi8_mask(__m128i __A) { return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A); } -static 
__inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_movepi8_mask (__m256i __A) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_movepi8_mask(__m256i __A) { return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_movepi16_mask (__m128i __A) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_movepi16_mask(__m128i __A) { return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 -_mm256_movepi16_mask (__m256i __A) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_movepi16_mask(__m256i __A) { return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A); } @@ -2536,7 +2524,7 @@ _mm256_movm_epi16 (__mmask16 __A) return (__m256i) __builtin_ia32_cvtmask2w256 (__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128(__M, @@ -2544,7 +2532,7 @@ _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) (__v16qi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128(__M, @@ -2552,7 +2540,7 @@ _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) (__v16qi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectb_256(__M, @@ -2560,7 +2548,7 @@ _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) (__v32qi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectb_256(__M, @@ -2568,7 +2556,7 @@ _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) (__v32qi) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128(__M, @@ -2576,7 +2564,7 @@ _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) (__v8hi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128(__M, @@ -2584,7 +2572,7 @@ _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256(__M, @@ -2592,7 +2580,7 @@ _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) (__v16hi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256(__M, @@ -2600,7 +2588,7 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) (__v16hi) _mm256_setzero_si256()); } -static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) { return (__m256i) __builtin_ia32_selectw_256 (__M, @@ -2608,7 +2596,7 @@ _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) (__v16hi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) { return (__m256i) __builtin_ia32_selectw_256(__M, @@ -2616,7 +2604,7 @@ _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) (__v16hi) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) { return (__m128i) __builtin_ia32_selectw_128(__M, @@ -2624,7 +2612,7 @@ _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) (__v8hi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_set1_epi16 (__mmask8 __M, short __A) { return (__m128i) __builtin_ia32_selectw_128(__M, @@ -2632,48 +2620,41 @@ _mm_maskz_set1_epi16 (__mmask8 __M, short __A) (__v8hi) _mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutexvar_epi16 (__m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutexvar_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_permutexvar_epi16(__A, __B), (__v8hi) _mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_permutexvar_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi16 (__m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, (__v16hi)_mm256_permutexvar_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, (__v16hi)_mm256_permutexvar_epi16(__A, __B), (__v16hi)__W); diff --git a/clang/lib/Headers/avx512vlcdintrin.h b/clang/lib/Headers/avx512vlcdintrin.h index 7719680f..df66e1d 100644 --- 
a/clang/lib/Headers/avx512vlcdintrin.h +++ b/clang/lib/Headers/avx512vlcdintrin.h @@ -16,13 +16,13 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512vl,avx512cd"), \ - __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl,avx512cd"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512vl,avx512cd"), \ - __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl,avx512cd"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index 707d039..c956aeb 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -914,15 +914,13 @@ _mm256_maskz_cvtepu64_ps(__mmask8 __U, __m256i __A) { (__v8sf)_mm256_setzero_ps(), \ (__mmask8)(U))) -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_movepi32_mask (__m128i __A) -{ +static __inline__ __mmask8 + __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_movepi32_mask(__m128i __A) { return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_movepi32_mask (__m256i __A) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_movepi32_mask(__m256i __A) { return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A); } @@ -950,15 +948,13 @@ _mm256_movm_epi64 (__mmask8 __A) return (__m256i) __builtin_ia32_cvtmask2q256 (__A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_movepi64_mask (__m128i __A) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_movepi64_mask(__m128i __A) { return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_movepi64_mask (__m256i __A) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_movepi64_mask(__m256i __A) { return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A); } diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index 5b2b3f0..7a762e1 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -623,7 +623,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { (__mmask16)(U))) static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { - return __builtin_ia32_sqrtph((__v8hf)__a); + return __builtin_elementwise_sqrt(__a); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, @@ -640,7 +640,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, } static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { - return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a); + return __builtin_elementwise_sqrt(__a); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 @@ -2010,24 +2010,24 @@ _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { (__v16hf)__A); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, 
(__v8hi)__B); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutexvar_ph(__m128i __A, __m128h __B) { return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutexvar_ph(__m256i __A, __m256h __B) { return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 92bb444..6e2b19c 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -347,65 +347,57 @@ _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) { (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epi32(__X, __Y), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epi32(__X, __Y), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epi32(__X, __Y), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epi32(__X, __Y), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epu32(__X, __Y), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_mul_epu32(__X, __Y), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return 
(__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epu32(__X, __Y), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, (__v2di)_mm_mul_epu32(__X, __Y), (__v2di)_mm_setzero_si128()); @@ -1426,56 +1418,56 @@ _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) (__v8sf) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_add_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_add_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_add_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_add_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_add_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_add_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_add_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_add_ps(__A, __B), @@ -1799,30 +1791,30 @@ _mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) { (__v4si)_mm_setzero_si128()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m128d __A) { return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) { +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_cvtpd_ps(__mmask8 __U, __m128d __A) { return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, (__v4sf) _mm_setzero_ps (), 
(__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) { +static __inline__ __m128 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m256d __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm256_cvtpd_ps(__A), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) { +static __inline__ __m128 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_cvtpd_ps(__mmask8 __U, __m256d __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm256_cvtpd_ps(__A), (__v4sf)_mm_setzero_ps()); @@ -2202,56 +2194,56 @@ _mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) { (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_div_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_div_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_div_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_div_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_div_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_div_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_div_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_div_ps(__A, __B), @@ -2717,56 +2709,56 @@ _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_mul_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR 
_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_mul_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_mul_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_mul_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_mul_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_mul_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_mul_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_mul_ps(__A, __B), @@ -3500,69 +3492,69 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_sub_pd(__A, __B), (__v2df)__W); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_sub_pd(__A, __B), (__v2df)_mm_setzero_pd()); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_sub_pd(__A, __B), (__v4df)__W); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_sub_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_sub_ps(__A, __B), (__v4sf)__W); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_sub_ps(__mmask8 
__U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_sub_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_sub_ps(__A, __B), (__v8sf)__W); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_sub_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I, (__v4si)__B); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3570,7 +3562,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3578,7 +3570,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)__I); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3586,13 +3578,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)_mm_setzero_si128()); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I, (__v8si) __B); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3600,7 +3592,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)__A); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3608,7 +3600,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)__I); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3616,40 +3608,43 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)_mm256_setzero_si256()); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { return 
(__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I, (__v2df)__B); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)__A); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)(__m128d)__I); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)_mm_setzero_pd()); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I, (__v4df)__B); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3657,7 +3652,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)__A); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3665,7 +3660,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)(__m256d)__I); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3673,47 +3668,48 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)_mm256_setzero_pd()); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I, (__v4sf)__B); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)__A); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)(__m128)__I); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR 
_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)_mm_setzero_ps()); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I, (__v8sf) __B); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR + _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, + __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), (__v8sf)__A); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, @@ -3721,7 +3717,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)(__m256)__I); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, @@ -3729,13 +3725,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I, (__v2di)__B); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3743,7 +3739,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3751,7 +3747,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)__I); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3759,14 +3755,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)_mm_setzero_si128()); } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I, (__v4di) __B); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -3774,7 +3769,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4di)__A); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ 
__m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -3782,7 +3777,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4di)__I); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -4304,33 +4299,29 @@ _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) (__v4di)_mm256_ror_epi64((a), (b)), \ (__v4di)_mm256_setzero_si256())) -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_sll_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_sll_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_sll_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_sll_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); @@ -4367,33 +4358,29 @@ _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_sll_epi64(__A, __B), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_sll_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_sll_epi64(__A, __B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR 
+_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_sll_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); @@ -4646,33 +4633,29 @@ _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_srl_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_srl_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_srl_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_srl_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); @@ -4709,33 +4692,29 @@ _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_srl_epi64(__A, __B), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_srl_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_srl_epi64(__A, __B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_srl_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); @@ -5852,65 +5831,57 @@ _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) (__v8sf)_mm256_permute_ps((X), (C)), \ (__v8sf)_mm256_setzero_ps())) -static __inline__ __m128d 
__DEFAULT_FN_ATTRS128 -_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_permutevar_pd(__A, __C), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_permutevar_pd(__A, __C), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_permutevar_pd(__A, __C), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_permutevar_pd(__A, __C), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_permutevar_ps(__A, __C), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_permutevar_ps(__A, __C), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_permutevar_ps(__A, __C), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_permutevar_ps(__A, __C), (__v8sf)_mm256_setzero_ps()); @@ -6140,33 +6111,29 @@ _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_sra_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sra_epi32(__mmask8 
__U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_sra_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_sra_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_sra_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); @@ -6201,45 +6168,39 @@ _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_sra_epi64(__m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_sra_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ (__v2di)_mm_sra_epi64(__A, __B), \ (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ (__v2di)_mm_sra_epi64(__A, __B), \ (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi64(__m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sra_epi64(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ (__v4di)_mm256_sra_epi64(__A, __B), \ (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ (__v4di)_mm256_sra_epi64(__A, __B), \ (__v4di)_mm256_setzero_si256()); @@ -7801,47 +7762,41 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) (__v4di)_mm256_permutex_epi64((X), (C)), \ (__v4di)_mm256_setzero_si256())) -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_pd (__m256i __X, __m256d __Y) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_pd(__m256i __X, __m256d __Y) 
{ return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, - __m256d __Y) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X, + __m256d __Y) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_permutexvar_pd(__X, __Y), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_permutexvar_pd(__X, __Y), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_permutexvar_epi64(__X, __Y), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_permutexvar_epi64(__X, __Y), (__v4di)__W); @@ -7849,17 +7804,15 @@ _mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, #define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A)) -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_permutexvar_ps(__X, __Y), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_permutexvar_ps(__X, __Y), (__v8sf)_mm256_setzero_ps()); @@ -7867,18 +7820,16 @@ _mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) #define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A)) -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) -{ + __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_permutexvar_epi32(__X, __Y), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR 
+_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_permutexvar_epi32(__X, __Y), (__v8si)_mm256_setzero_si256()); diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index a1a0338..4b8a199 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -80,8 +80,8 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssd_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -98,8 +98,9 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssds_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A), \ + (__v16hi)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -157,8 +158,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssd_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -175,8 +176,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssds_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h index c386923..2ce88efe 100644 --- a/clang/lib/Headers/avx512vnniintrin.h +++ b/clang/lib/Headers/avx512vnniintrin.h @@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index e452d5f..30df01c 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -17,11 +17,11 @@ /* Define the default 
attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 4aef924..9b45bc3 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -147,9 +147,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a, /// A 256-bit vector of [4 x double] containing the right source operand. /// \returns A 256-bit vector of [4 x double] containing the alternating sums /// and differences between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_addsub_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_addsub_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); } @@ -166,9 +165,8 @@ _mm256_addsub_pd(__m256d __a, __m256d __b) /// A 256-bit vector of [8 x float] containing the right source operand. /// \returns A 256-bit vector of [8 x float] containing the alternating sums and /// differences between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_addsub_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_addsub_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); } @@ -335,10 +333,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a, /// A 256-bit vector of [4 x double]. /// \returns A 256-bit vector of [4 x double] containing the square roots of the /// values in the operand. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_sqrt_pd(__m256d __a) -{ - return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); +static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) { + return __builtin_elementwise_sqrt(__a); } /// Calculates the square roots of the values in a 256-bit vector of @@ -352,10 +348,8 @@ _mm256_sqrt_pd(__m256d __a) /// A 256-bit vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the square roots of the /// values in the operand. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_sqrt_ps(__m256 __a) -{ - return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); +static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) { + return __builtin_elementwise_sqrt(__a); } /// Calculates the reciprocal square roots of the values in a 256-bit @@ -789,9 +783,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, /// 1: Bits [127:64] of the source are copied to bits [127:64] of the /// returned vector. /// \returns A 128-bit vector of [2 x double] containing the copied values. 
-static __inline __m128d __DEFAULT_FN_ATTRS128 -_mm_permutevar_pd(__m128d __a, __m128i __c) -{ +static __inline __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutevar_pd(__m128d __a, __m128i __c) { return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); } @@ -828,9 +821,8 @@ _mm_permutevar_pd(__m128d __a, __m128i __c) /// 1: Bits [255:192] of the source are copied to bits [255:192] of the /// returned vector. /// \returns A 256-bit vector of [4 x double] containing the copied values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_permutevar_pd(__m256d __a, __m256i __c) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_permutevar_pd(__m256d __a, __m256i __c) { return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); } @@ -883,9 +875,8 @@ _mm256_permutevar_pd(__m256d __a, __m256i __c) /// 11: Bits [127:96] of the source are copied to bits [127:96] of the /// returned vector. /// \returns A 128-bit vector of [4 x float] containing the copied values. -static __inline __m128 __DEFAULT_FN_ATTRS128 -_mm_permutevar_ps(__m128 __a, __m128i __c) -{ +static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutevar_ps(__m128 __a, __m128i __c) { return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); } @@ -974,9 +965,8 @@ _mm_permutevar_ps(__m128 __a, __m128i __c) /// 11: Bits [255:224] of the source are copied to bits [255:224] of the /// returned vector. /// \returns A 256-bit vector of [8 x float] containing the copied values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_permutevar_ps(__m256 __a, __m256i __c) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_permutevar_ps(__m256 __a, __m256i __c) { return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); } @@ -2196,9 +2186,8 @@ _mm256_cvtepi32_ps(__m256i __a) { /// \param __a /// A 256-bit vector of [4 x double]. /// \returns A 128-bit vector of [4 x float] containing the converted values. -static __inline __m128 __DEFAULT_FN_ATTRS -_mm256_cvtpd_ps(__m256d __a) -{ +static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_cvtpd_ps(__m256d __a) { return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); } @@ -3616,9 +3605,7 @@ _mm256_undefined_pd(void) /// This intrinsic has no corresponding instruction. /// /// \returns A 256-bit vector of [8 x float] containing undefined values. -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_undefined_ps(void) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_undefined_ps(void) { return (__m256)__builtin_ia32_undef256(); } diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h index 805d249..98d94ee 100644 --- a/clang/lib/Headers/avxvnniint16intrin.h +++ b/clang/lib/Headers/avxvnniint16intrin.h @@ -16,9 +16,10 @@ #define __AVXVNNIINT16INTRIN_H /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. 
/// /// \headerfile <immintrin.h> /// @@ -40,19 +41,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwsud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -74,20 +77,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) -/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwsud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -109,20 +113,22 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwsuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results.
Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -144,19 +150,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwsuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -178,19 +186,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwusd_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. 
/// /// \headerfile <immintrin.h> /// @@ -212,20 +222,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwusd_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and +/// store the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -233,7 +244,7 @@ /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. /// /// \param __W /// A 128-bit vector of [4 x int]. @@ -247,20 +258,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwusds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and +/// store the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -268,7 +280,7 @@ /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. /// /// \param __W /// A 256-bit vector of [8 x int]. 
@@ -282,19 +294,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwusds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -305,30 +319,32 @@ /// This intrinsic corresponds to the \c VPDPWUUD instruction. /// /// \param __W -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// \param __A /// A 128-bit vector of [8 x unsigned short]. /// \param __B /// A 128-bit vector of [8 x unsigned short]. /// \returns -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwuud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -339,31 +355,32 @@ /// This intrinsic corresponds to the \c VPDPWUUD instruction. /// /// \param __W -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// \param __A /// A 256-bit vector of [16 x unsigned short]. /// \param __B /// A 256-bit vector of [16 x unsigned short]. /// \returns -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. 
/// /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwuud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -371,34 +388,35 @@ /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. /// /// \param __W -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// \param __A /// A 128-bit vector of [8 x unsigned short]. /// \param __B /// A 128-bit vector of [8 x unsigned short]. /// \returns -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwuuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -406,27 +424,28 @@ /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. /// /// \param __W -/// A 256-bit vector of [8 x unsigned int]. 
+/// A 256-bit vector of [8 x int]. /// \param __A /// A 256-bit vector of [16 x unsigned short]. /// \param __B /// A 256-bit vector of [16 x unsigned short]. /// \returns -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwuuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hu)(__B))) #endif // __AVXVNNIINT16INTRIN_H diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h index 3c4c44a..1d2e8c9 100644 --- a/clang/lib/Headers/avxvnniintrin.h +++ b/clang/lib/Headers/avxvnniintrin.h @@ -109,7 +109,8 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A, + (__v16hi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -130,7 +131,8 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A, + (__v16hi)__B); } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with @@ -199,7 +201,8 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A, + (__v8hi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -220,7 +223,8 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A, + (__v8hi)__B); } #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h index 45700c6..156425c 100644 --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -253,10 +253,6 @@ #define bit_RDPRU 0x00000010 #define bit_WBNOINVD 0x00000200 -/* Features in %ebx for leaf 0x24 */ -#define bit_AVX10_256 0x00020000 -#define bit_AVX10_512 0x00040000 - #ifdef __i386__ #define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \ __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \ @@ -282,6 +278,24 @@ : "0"(__leaf), "2"(__count)) #endif +/// Queries the processor to determine the highest supported \c CPUID leaf. +/// This intrinsic is only available on x86 and x64. 
+/// +/// \headerfile <cpuid.h> +/// +/// This intrinsic corresponds to the <c> CPUID </c> instruction. +/// +/// \param __leaf +/// \a __leaf can be either 0x0 or 0x80000000. If \a __leaf == 0x0, the +/// highest supported value for basic \c CPUID information is returned. +/// If \a __leaf == 0x80000000, the highest supported value for extended +/// \c CPUID information is returned. +/// \param __sig +/// If the \a __sig pointer is non-null, the first four bytes of the +/// signature (as found in the \c EBX register) are returned in the +/// location pointed to by \a __sig. +/// \returns Returns 0 if \c CPUID is not supported; otherwise returns the +/// value that \c CPUID returns in the \c EAX register. static __inline unsigned int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig) { @@ -315,6 +329,32 @@ static __inline unsigned int __get_cpuid_max (unsigned int __leaf, return __eax; } +/// For the requested \c CPUID leaf, queries the processor for information +/// about the CPU type and CPU features (such as processor vendor, supported +/// instruction sets, CPU capabilities, cache sizes, CPU model and family, and +/// other hardware details). This intrinsic is only available on x86 and x64. +/// +/// \headerfile <cpuid.h> +/// +/// This intrinsic corresponds to the <c> CPUID </c> instruction. +/// +/// \param __leaf +/// An unsigned integer that identifies the level (also called "leaf") at +/// which the \c CPUID instruction will be executed. +/// \param __eax +/// A pointer to an integer that corresponds to the \c EAX register where +/// \c CPUID stores output results. +/// \param __ebx +/// A pointer to an integer that corresponds to the \c EBX register where +/// \c CPUID stores output results. +/// \param __ecx +/// A pointer to an integer that corresponds to the \c ECX register where +/// \c CPUID stores output results. +/// \param __edx +/// A pointer to an integer that corresponds to the \c EDX register where +/// \c CPUID stores output results. +/// \returns Returns 1 if the requested \c CPUID leaf is supported; otherwise +/// returns 0. static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax, unsigned int *__ebx, unsigned int *__ecx, unsigned int *__edx) @@ -328,6 +368,36 @@ static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax, return 1; } +/// For the requested \c CPUID leaf and subleaf, queries the processor for +/// information about the CPU type and CPU features (such as processor vendor, +/// supported instruction sets, CPU capabilities, cache sizes, CPU model and +/// family, and other hardware details). This intrinsic is only available on +/// x86 and x64. +/// +/// \headerfile <cpuid.h> +/// +/// This intrinsic corresponds to the <c> CPUID </c> instruction. +/// +/// \param __leaf +/// An unsigned integer that identifies the level (also called "leaf") at +/// which the \c CPUID instruction will be executed. +/// \param __subleaf +/// An unsigned integer that identifies the sublevel (also called +/// "subleaf") at which the \c CPUID instruction will be executed. +/// \param __eax +/// A pointer to an integer that corresponds to the \c EAX register where +/// \c CPUID stores output results. +/// \param __ebx +/// A pointer to an integer that corresponds to the \c EBX register where +/// \c CPUID stores output results. +/// \param __ecx +/// A pointer to an integer that corresponds to the \c ECX register where +/// \c CPUID stores output results.
+/// \param __edx +/// A pointer to an integer that corresponds to the \c EDX register where +/// \c CPUID stores output results. +/// \returns Returns 1 if the requested \c CPUID leaf is supported; otherwise +/// returns 0. static __inline int __get_cpuid_count (unsigned int __leaf, unsigned int __subleaf, unsigned int *__eax, unsigned int *__ebx, @@ -349,6 +419,28 @@ static __inline int __get_cpuid_count (unsigned int __leaf, // builtin. Given __has_builtin does not detect builtins on aux triples, we need // to explicitly check for some offloading cases. #if !defined(__NVPTX__) && !defined(__AMDGPU__) && !defined(__SPIRV__) +/// Executes the \c CPUID instruction with the specified leaf and subleaf +/// values, and returns the results from the CPU's registers. This intrinsic +/// is only available on x86 and x64. +/// +/// \headerfile <cpuid.h> +/// +/// This intrinsic corresponds to the <c> CPUID </c> instruction. +/// +/// \param __cpu_info +/// An output array of four integers: +/// <ul> +/// <li>\a __cpu_info[0] receives the value of the \c EAX register.</li> +/// <li>\a __cpu_info[1] receives the value of the \c EBX register.</li> +/// <li>\a __cpu_info[2] receives the value of the \c ECX register.</li> +/// <li>\a __cpu_info[3] receives the value of the \c EDX register.</li> +/// </ul> +/// \param __leaf +/// An unsigned integer that identifies the level (also called the "leaf") +/// at which the \c CPUID instruction will be executed. +/// \param __subleaf +/// An unsigned integer that identifies the sublevel (also called the +/// "subleaf") at which the \c CPUID instruction will be executed. static __inline void __cpuidex(int __cpu_info[4], int __leaf, int __subleaf) { __cpuid_count(__leaf, __subleaf, __cpu_info[0], __cpu_info[1], __cpu_info[2], __cpu_info[3]); diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index dbe5ca0..1ca7097 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -241,8 +241,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a, /// bits are copied from the upper 64 bits of operand \a __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b) { - __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); - return __extension__(__m128d){__c[0], __a[1]}; + return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]}; } /// Calculates the square root of the each of two values stored in a @@ -257,7 +256,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, /// \returns A 128-bit vector of [2 x double] containing the square roots of the /// values in the operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) { - return __builtin_ia32_sqrtpd((__v2df)__a); + return __builtin_elementwise_sqrt(__a); } /// Compares lower 64-bit double-precision values of both operands, and @@ -1279,7 +1278,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, /// A 128-bit vector of [2 x double]. /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted values. The upper 64 bits are set to zero. -static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) { +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtpd_ps(__m128d __a) { return __builtin_ia32_cvtpd2ps((__v2df)__a); } @@ -1384,8 +1384,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) { /// \returns A 128-bit vector of [4 x float].
The lower 32 bits contain the /// converted value from the second parameter. The upper 96 bits are copied /// from the upper 96 bits of the first parameter. -static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, - __m128d __b) { +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsd_ss(__m128 __a, __m128d __b) { return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); } @@ -2783,8 +2783,8 @@ _mm_slli_epi16(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to left-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sll_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); } @@ -2819,8 +2819,8 @@ _mm_slli_epi32(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to left-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sll_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); } @@ -2855,8 +2855,8 @@ _mm_slli_epi64(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to left-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sll_epi64(__m128i __a, __m128i __count) { return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); } @@ -2893,8 +2893,8 @@ _mm_srai_epi16(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sra_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); } @@ -2931,8 +2931,8 @@ _mm_srai_epi32(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sra_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); } @@ -2992,8 +2992,8 @@ _mm_srli_epi16(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. 
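A minimal sketch of what __DEFAULT_FN_ATTRS_CONSTEXPR enables on these shift intrinsics, assuming a C++11-or-later translation unit (where the macro carries constexpr) and that the underlying builtins are constant-evaluable, which is what this patch provides; the initializers use clang's vector extensions and the values are illustrative.

#include <emmintrin.h>
/* The whole shift folds at compile time: each 32-bit lane of val is
   shifted left by the bit count held in the low 64 bits of cnt. */
constexpr __m128i val = (__m128i)(__v4si){1, 2, 3, 4};
constexpr __m128i cnt = (__m128i)(__v2di){2, 0};
constexpr __m128i res = _mm_sll_epi32(val, cnt); /* lanes: 4, 8, 12, 16 */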
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srl_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); } @@ -3028,8 +3028,8 @@ _mm_srli_epi32(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srl_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); } @@ -3064,8 +3064,8 @@ _mm_srli_epi64(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srl_epi64(__m128i __a, __m128i __count) { return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); } diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h index e0a0e4c..20b8030 100644 --- a/clang/lib/Headers/fma4intrin.h +++ b/clang/lib/Headers/fma4intrin.h @@ -40,16 +40,14 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) { (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) { + return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0])); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) { + return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0])); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR @@ -64,16 +62,14 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) { -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) { + return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0])); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) { + return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0])); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR @@ -88,16 +84,14 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) { (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, 
(__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) { + return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0])); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) { + return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0])); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR @@ -112,16 +106,14 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) { -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) { + return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0])); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) { + return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0])); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h index c510090..eba527f 100644 --- a/clang/lib/Headers/fmaintrin.h +++ b/clang/lib/Headers/fmaintrin.h @@ -95,10 +95,10 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) /// 32 bits. /// \returns A 128-bit vector of [4 x float] containing the result in the low /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) { + __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]); + return __A; } /// Computes a scalar multiply-add of the double-precision values in the @@ -124,10 +124,10 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) /// 64 bits. /// \returns A 128-bit vector of [2 x double] containing the result in the low /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) { + __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]); + return __A; } /// Computes a multiply-subtract of 128-bit vectors of [4 x float]. @@ -195,10 +195,10 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) /// 32 bits. /// \returns A 128-bit vector of [4 x float] containing the result in the low /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 
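The rewritten scalar FMA forms update lane 0 of __A in place and return __A, which makes the documented upper-lane behavior easy to check; a small sketch with illustrative values (fmadd_ss_demo is not part of the header):

#include <immintrin.h>
/* Requires FMA: lane 0 becomes 2*3 + 4 = 10; lanes 1..3 keep a's values. */
static inline __m128 fmadd_ss_demo(void) {
  __m128 a = _mm_setr_ps(2.0f, 10.0f, 20.0f, 30.0f);
  __m128 b = _mm_set1_ps(3.0f);
  __m128 c = _mm_set1_ps(4.0f);
  return _mm_fmadd_ss(a, b, c); /* {10.0f, 10.0f, 20.0f, 30.0f} */
}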
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) { + __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]); + return __A; } /// Computes a scalar multiply-subtract of the double-precision values in @@ -224,10 +224,10 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) /// 64 bits. /// \returns A 128-bit vector of [2 x double] containing the result in the low /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) { + __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]); + return __A; } /// Computes a negated multiply-add of 128-bit vectors of [4 x float]. @@ -295,10 +295,10 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) /// 32 bits. /// \returns A 128-bit vector of [4 x float] containing the result in the low /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) { + __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]); + return __A; } /// Computes a scalar negated multiply-add of the double-precision values @@ -324,10 +324,10 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) /// 64 bits. /// \returns A 128-bit vector of [2 x double] containing the result in the low /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) { + __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]); + return __A; } /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float]. @@ -395,10 +395,10 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) /// 32 bits. /// \returns A 128-bit vector of [4 x float] containing the result in the low /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) { + __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]); + return __A; } /// Computes a scalar negated multiply-subtract of the double-precision @@ -424,10 +424,10 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) /// 64 bits. /// \returns A 128-bit vector of [2 x double] containing the result in the low /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 
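All four scalar variants now reduce to one elementwise fma with sign flips on the operands; a scalar model of the fnmadd case, using libm's fmaf as a stand-in for the builtin:

#include <math.h>
/* -(a*b) + c computed as a single fused fma(a, -b, c), mirroring the
   lane-0 computation in the rewritten _mm_fnmadd_ss above. */
static inline float fnmadd_lane0(float a, float b, float c) {
  return fmaf(a, -b, c);
}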
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) { + __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]); + return __A; } /// Computes a multiply with alternating add/subtract of 128-bit vectors of diff --git a/clang/lib/Headers/gfniintrin.h b/clang/lib/Headers/gfniintrin.h index 1df1eac..2c559f1 100644 --- a/clang/lib/Headers/gfniintrin.h +++ b/clang/lib/Headers/gfniintrin.h @@ -15,6 +15,35 @@ #define __GFNIINTRIN_H /* Default attributes for simple form (no masking). */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("gfni"), \ + __min_vector_width__(128))) constexpr + +/* Default attributes for YMM unmasked form. */ +#define __DEFAULT_FN_ATTRS_Y \ + __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), \ + __min_vector_width__(256))) constexpr + +/* Default attributes for VLX masked forms. */ +#define __DEFAULT_FN_ATTRS_VL128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bw,avx512vl,gfni"), \ + __min_vector_width__(128))) constexpr +#define __DEFAULT_FN_ATTRS_VL256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bw,avx512vl,gfni"), \ + __min_vector_width__(256))) constexpr + +/* Default attributes for ZMM unmasked forms. */ +#define __DEFAULT_FN_ATTRS_Z \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), \ + __min_vector_width__(512))) constexpr +/* Default attributes for ZMM masked forms. */ +#define __DEFAULT_FN_ATTRS_Z_MASK \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), \ + __min_vector_width__(512))) constexpr +#else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("gfni"), \ __min_vector_width__(128))) @@ -42,6 +71,7 @@ #define __DEFAULT_FN_ATTRS_Z_MASK \ __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), \ __min_vector_width__(512))) +#endif #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index a918af3..38b95ee9 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -1053,76 +1053,25 @@ _HLSL_BUILTIN_ALIAS(__builtin_elementwise_exp2) float4 exp2(float4); //===----------------------------------------------------------------------===// -// firstbithigh builtins +// f16tof32 builtins //===----------------------------------------------------------------------===// -/// \fn T firstbithigh(T Val) -/// \brief Returns the location of the first set bit starting from the highest -/// order bit and working downward, per component. -/// \param Val the input value. 
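The gfniintrin.h hunk above repeats the conditional-constexpr idiom these headers use: one macro definition per language mode, so the same functions are constexpr in C++11 and later but remain plain always_inline elsewhere. A self-contained sketch of the pattern (MY_FN_ATTRS and my_add are hypothetical names):

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define MY_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) constexpr
#else
#define MY_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#endif

/* Usable in constant expressions under C++11+, ordinary inline otherwise. */
static MY_FN_ATTRS int my_add(int __a, int __b) { return __a + __b; }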
- -#ifdef __HLSL_ENABLE_16_BIT -_HLSL_AVAILABILITY(shadermodel, 6.2) -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint firstbithigh(int16_t); -_HLSL_AVAILABILITY(shadermodel, 6.2) -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint2 firstbithigh(int16_t2); -_HLSL_AVAILABILITY(shadermodel, 6.2) -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint3 firstbithigh(int16_t3); -_HLSL_AVAILABILITY(shadermodel, 6.2) -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint4 firstbithigh(int16_t4); -_HLSL_AVAILABILITY(shadermodel, 6.2) -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint firstbithigh(uint16_t); -_HLSL_AVAILABILITY(shadermodel, 6.2) -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint2 firstbithigh(uint16_t2); -_HLSL_AVAILABILITY(shadermodel, 6.2) -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint3 firstbithigh(uint16_t3); -_HLSL_AVAILABILITY(shadermodel, 6.2) -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint4 firstbithigh(uint16_t4); -#endif +/// \fn float f16tof32(uint x) +/// \brief Returns the half value stored in the low 16 bits of the uint arg +/// converted to a float. +/// \param x The uint containing two half values. +/// +/// The float value of the half value found in the low 16 bits of the \a x +/// parameter. -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint firstbithigh(int); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint2 firstbithigh(int2); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint3 firstbithigh(int3); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint4 firstbithigh(int4); - -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint firstbithigh(uint); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint2 firstbithigh(uint2); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint3 firstbithigh(uint3); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint4 firstbithigh(uint4); - -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint firstbithigh(int64_t); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint2 firstbithigh(int64_t2); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint3 firstbithigh(int64_t3); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint4 firstbithigh(int64_t4); - -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint firstbithigh(uint64_t); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint2 firstbithigh(uint64_t2); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint3 firstbithigh(uint64_t3); -_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh) -uint4 firstbithigh(uint64_t4); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float f16tof32(uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float2 f16tof32(uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float3 f16tof32(uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float4 f16tof32(uint4); //===----------------------------------------------------------------------===// // firstbitlow builtins @@ -2090,9 +2039,17 @@ T select(bool, T, T); /// \param FalseVals The vector values are chosen from when conditions are /// false.
-template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, vector<T, 2>); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, vector<T, 3>); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>); +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, vector<T, 4>); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, T TrueVal, /// vector<T,Sz> FalseVals) @@ -2102,9 +2059,17 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>); /// \param FalseVals The vector values are chosen from when conditions are /// false. -template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, T, vector<T, 2>); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 3> select(vector<bool, 3>, T, vector<T, 3>); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>); +vector<T, 4> select(vector<bool, 4>, T, vector<T, 4>); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, vector<T,Sz> TrueVals, /// T FalseVal) @@ -2113,9 +2078,17 @@ vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>); /// \param TrueVals The vector values are chosen from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. -template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, T); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T); +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, T); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, T); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, vector<T,Sz> TrueVals, /// T FalseVal) @@ -2124,10 +2097,20 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T); /// \param TrueVal The scalar value to splat from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. 
-template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 2>> select( + vector<bool, 2>, T, T); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 3>> select( + vector<bool, 3>, T, T); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, Sz>> select( - vector<bool, Sz>, T, T); +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 4>> select( + vector<bool, 4>, T, T); //===----------------------------------------------------------------------===// // sin builtins @@ -2963,5 +2946,73 @@ float4 radians(float4); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_group_memory_barrier_with_group_sync) __attribute__((convergent)) void GroupMemoryBarrierWithGroupSync(void); +//===----------------------------------------------------------------------===// +// ddx_coarse builtin +//===----------------------------------------------------------------------===// + +/// \fn T ddx_coarse(T value) +/// \brief Computes a low precision partial derivative with respect to the +/// screen-space x-coordinate. +/// \param value The input value. +/// +/// The return value is a floating point scalar or vector containing the low +/// precision partial derivative of the input value. + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +half ddx_coarse(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +half2 ddx_coarse(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +half3 ddx_coarse(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +half4 ddx_coarse(half4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +float ddx_coarse(float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +float2 ddx_coarse(float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +float3 ddx_coarse(float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddx_coarse) +float4 ddx_coarse(float4); + +//===----------------------------------------------------------------------===// +// ddy_coarse builtin +//===----------------------------------------------------------------------===// + +/// \fn T ddy_coarse(T value) +/// \brief Computes a low precision partial derivative with respect to the +/// screen-space y-coordinate. +/// \param value The input value. +/// +/// The return value is a floating point scalar or vector containing the low +/// precision partial derivative of the input value.
+ +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +half ddy_coarse(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +half2 ddy_coarse(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +half3 ddy_coarse(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +half4 ddy_coarse(half4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +float ddy_coarse(float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +float2 ddy_coarse(float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +float3 ddy_coarse(float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_ddy_coarse) +float4 ddy_coarse(float4); + } // namespace hlsl #endif //_HLSL_HLSL_ALIAS_INTRINSICS_H_ diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h index fe4277e..ee243ab 100644 --- a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h +++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #ifndef _HLSL_COMPAT_OVERLOADS_H_ -#define _HLSl_COMPAT_OVERLOADS_H_ +#define _HLSL_COMPAT_OVERLOADS_H_ namespace hlsl { diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h index c877234..3550409 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h @@ -137,17 +137,37 @@ template <typename T> constexpr vector<T, 4> lit_impl(T NDotL, T NDotH, T M) { } template <typename T> constexpr T faceforward_impl(T N, T I, T Ng) { -#if (__has_builtin(__builtin_spirv_faceforward)) - return __builtin_spirv_faceforward(N, I, Ng); -#else return select(dot(I, Ng) < 0, N, -N); -#endif } template <typename T> constexpr T ldexp_impl(T X, T Exp) { return exp2(Exp) * X; } +template <typename K, typename T, int BitWidth> +constexpr K firstbithigh_impl(T X) { + K FBH = __builtin_hlsl_elementwise_firstbithigh(X); +#if defined(__DIRECTX__) + // The firstbithigh DXIL ops count bits from the wrong side, so we need to + // invert it for DirectX. + K Inversion = (BitWidth - 1) - FBH; + FBH = select(FBH == -1, FBH, Inversion); +#endif + return FBH; +} + +template <typename T> constexpr T fwidth_impl(T input) { +#if (__has_builtin(__builtin_spirv_fwidth)) + return __builtin_spirv_fwidth(input); +#else + T derivCoarseX = ddx_coarse(input); + derivCoarseX = abs(derivCoarseX); + T derivCoarseY = ddy_coarse(input); + derivCoarseY = abs(derivCoarseY); + return derivCoarseX + derivCoarseY; +#endif +} + } // namespace __detail } // namespace hlsl diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 5ba5bfb..a538be5 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -262,6 +262,67 @@ faceforward(__detail::HLSL_FIXED_VECTOR<float, L> N, } //===----------------------------------------------------------------------===// +// firstbithigh builtins +//===----------------------------------------------------------------------===// + +/// \fn T firstbithigh(T Val) +/// \brief Returns the location of the first set bit starting from the highest +/// order bit and working downward, per component. +/// \param Val the input value.
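For 32-bit operands, the index these overloads produce (after the DirectX-side inversion in firstbithigh_impl above) can be modeled in plain C with a count-leading-zeros builtin; a hedged sketch, not part of the header:

/* LSB-relative index of the highest set bit; the all-ones result for a
   zero input matches the -1 sentinel that firstbithigh_impl's select
   preserves. */
static inline unsigned int firstbithigh_u32(unsigned int x) {
  return x ? 31u - (unsigned int)__builtin_clz(x) : ~0u;
}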
+ +#ifdef __HLSL_ENABLE_16_BIT + +template <typename T> +_HLSL_AVAILABILITY(shadermodel, 6.2) +const inline __detail::enable_if_t<__detail::is_same<int16_t, T>::value || + __detail::is_same<uint16_t, T>::value, + uint> firstbithigh(T X) { + return __detail::firstbithigh_impl<uint, T, 16>(X); +} + +template <typename T, int N> +_HLSL_AVAILABILITY(shadermodel, 6.2) +const + inline __detail::enable_if_t<__detail::is_same<int16_t, T>::value || + __detail::is_same<uint16_t, T>::value, + vector<uint, N>> firstbithigh(vector<T, N> X) { + return __detail::firstbithigh_impl<vector<uint, N>, vector<T, N>, 16>(X); +} + +#endif + +template <typename T> +const inline __detail::enable_if_t< + __detail::is_same<int, T>::value || __detail::is_same<uint, T>::value, uint> +firstbithigh(T X) { + return __detail::firstbithigh_impl<uint, T, 32>(X); +} + +template <typename T, int N> +const inline __detail::enable_if_t<__detail::is_same<int, T>::value || + __detail::is_same<uint, T>::value, + vector<uint, N>> +firstbithigh(vector<T, N> X) { + return __detail::firstbithigh_impl<vector<uint, N>, vector<T, N>, 32>(X); +} + +template <typename T> +const inline __detail::enable_if_t<__detail::is_same<int64_t, T>::value || + __detail::is_same<uint64_t, T>::value, + uint> +firstbithigh(T X) { + return __detail::firstbithigh_impl<uint, T, 64>(X); +} + +template <typename T, int N> +const inline __detail::enable_if_t<__detail::is_same<int64_t, T>::value || + __detail::is_same<uint64_t, T>::value, + vector<uint, N>> +firstbithigh(vector<T, N> X) { + return __detail::firstbithigh_impl<vector<uint, N>, vector<T, N>, 64>(X); +} + +//===----------------------------------------------------------------------===// // fmod builtins //===----------------------------------------------------------------------===// @@ -605,5 +666,49 @@ smoothstep(__detail::HLSL_FIXED_VECTOR<float, N> Min, return __detail::smoothstep_vec_impl(Min, Max, X); } +inline bool CheckAccessFullyMapped(uint Status) { + return static_cast<bool>(Status); +} + +//===----------------------------------------------------------------------===// +// fwidth builtin +//===----------------------------------------------------------------------===// + +/// \fn T fwidth(T x) +/// \brief Computes the sum of the absolute values of the partial derivatives +/// with regard to the x and y screen space coordinates. +/// \param x [in] The floating-point scalar or vector to process. +/// +/// The return value is a floating-point scalar or vector where each element +/// holds the computation of the matching element in the input. 
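When the SPIR-V builtin is unavailable, fwidth_impl (earlier in this diff) composes the two coarse derivatives declared above; a scalar model of that fallback, with the derivative arguments standing in for the per-quad values a real shader would see:

#include <math.h>
/* fwidth(x) fallback shape: |ddx_coarse(x)| + |ddy_coarse(x)|. */
static inline float fwidth_model(float ddx_val, float ddy_val) {
  return fabsf(ddx_val) + fabsf(ddy_val);
}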
+ +template <typename T> +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline __detail::enable_if_t<__detail::is_arithmetic<T>::Value && + __detail::is_same<half, T>::value, + T> fwidth(T input) { + return __detail::fwidth_impl(input); +} + +template <typename T> +const inline __detail::enable_if_t< + __detail::is_arithmetic<T>::Value && __detail::is_same<float, T>::value, T> +fwidth(T input) { + return __detail::fwidth_impl(input); +} + +template <int N> +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline __detail::HLSL_FIXED_VECTOR<half, N> fwidth( + __detail::HLSL_FIXED_VECTOR<half, N> input) { + return __detail::fwidth_impl(input); +} + +template <int N> +const inline __detail::HLSL_FIXED_VECTOR<float, N> +fwidth(__detail::HLSL_FIXED_VECTOR<float, N> input) { + return __detail::fwidth_impl(input); +} + } // namespace hlsl #endif //_HLSL_HLSL_INTRINSICS_H_ diff --git a/clang/lib/Headers/hvx_hexagon_protos.h b/clang/lib/Headers/hvx_hexagon_protos.h index fd120a5..19309a4 100644 --- a/clang/lib/Headers/hvx_hexagon_protos.h +++ b/clang/lib/Headers/hvx_hexagon_protos.h @@ -5605,6 +5605,399 @@ __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv) #endif /* __HEXAGON_ARCH___ >= 79 */ +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vabs(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vabs_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vabs(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vabs_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vabs_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vabs(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vabs_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vabs(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vabs_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vabs_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32=valign4(Vu32,Vv32,Rt8) + C Intrinsic Prototype: HVX_Vector Q6_V_valign4_VVR(HVX_Vector Vu, HVX_Vector + Vv, Word32 Rt) Instruction Type: CVI_VA Execution Slots: SLOT0123 + 
========================================================================== */ + +#define Q6_V_valign4_VVR(Vu, Vv, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_valign4)(Vu, Vv, Rt) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.bf=Vuu32.qf32 + C Intrinsic Prototype: HVX_Vector Q6_Vbf_equals_Wqf32(HVX_VectorPair Vuu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vbf_equals_Wqf32(Vuu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_bf_qf32)(Vuu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.f8=Vu32.qf16 + C Intrinsic Prototype: HVX_Vector Q6_V_equals_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_V_equals_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_f8_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.h=Vu32.hf:rnd + C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf_rnd(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vh_equals_Vhf_rnd(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf_rnd)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vdd32.qf16=Vu32.f8 + C Intrinsic Prototype: HVX_VectorPair Q6_Wqf16_equals_V(HVX_Vector Vu) + Instruction Type: CVI_VP_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Wqf16_equals_V(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_f8)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=Vu32.hf + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_equals_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=Vu32.qf16 + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_equals_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_equals_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=Vu32.qf32 + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_equals_Vqf32(Vu) 
\ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=Vu32.sf + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_equals_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_equals_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qd4=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VhfVhf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eq_VhfVhf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf)(Vu, Vv)), -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4&=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqand_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_and)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4|=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqor_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_or)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4^=vcmp.eq(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVhfVhf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqxacc_QVhfVhf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqhf_xor)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qd4=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eq_VsfVsf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123 + 
========================================================================== */ + +#define Q6_Q_vcmp_eq_VsfVsf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf)(Vu, Vv)), -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4&=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqand_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqand_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_and)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4|=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqor_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqor_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_or)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Qx4^=vcmp.eq(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_eqxacc_QVsfVsf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_eqxacc_QVsfVsf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt)( \ + (__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_veqsf_xor)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vqf32(HVX_Vector Vu) + 
Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.w=vilog2(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vw_vilog2_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_vilog2_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vilog2_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vneg(Vu32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vneg_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vneg(Vu32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vneg_Vqf16(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vneg_Vqf16(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf16_qf16)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vneg(Vu32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vqf32(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vneg_Vqf32(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_qf32)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vneg(Vu32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vneg_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf32_vneg_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vneg_qf32_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vsub(Vu32.hf,Vv32.qf16) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vsub_VhfVqf16(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vqf16_vsub_VhfVqf16(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_mix)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 81 */ + +#if __HVX_ARCH__ >= 81 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vsub(Vu32.sf,Vv32.qf32) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vsub_VsfVqf32(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123 + 
========================================================================== */ + +#define Q6_Vqf32_vsub_VsfVqf32(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_mix)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 81 */ + #endif /* __HVX__ */ #endif diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h index 85020d8..83cc428 100644 --- a/clang/lib/Headers/lasxintrin.h +++ b/clang/lib/Headers/lasxintrin.h @@ -10,6 +10,8 @@ #ifndef _LOONGSON_ASXINTRIN_H #define _LOONGSON_ASXINTRIN_H 1 +#include <lsxintrin.h> + #if defined(__loongarch_asx) typedef signed char v32i8 __attribute__((vector_size(32), aligned(32))); @@ -3882,5 +3884,116 @@ extern __inline #define __lasx_xvrepli_w(/*si10*/ _1) ((__m256i)__builtin_lasx_xvrepli_w((_1))) +#if defined(__loongarch_asx_sx_conv) + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) __m256 __lasx_cast_128_s(__m128 _1) { + return (__m256)__builtin_lasx_cast_128_s((v4f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_cast_128_d(__m128d _1) { + return (__m256d)__builtin_lasx_cast_128_d((v2f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_cast_128(__m128i _1) { + return (__m256i)__builtin_lasx_cast_128((v2i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_concat_128_s(__m128 _1, __m128 _2) { + return (__m256)__builtin_lasx_concat_128_s((v4f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_concat_128_d(__m128d _1, __m128d _2) { + return (__m256d)__builtin_lasx_concat_128_d((v2f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_concat_128(__m128i _1, __m128i _2) { + return (__m256i)__builtin_lasx_concat_128((v2i64)_1, (v2i64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 + __lasx_extract_128_lo_s(__m256 _1) { + return (__m128)__builtin_lasx_extract_128_lo_s((v8f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d + __lasx_extract_128_lo_d(__m256d _1) { + return (__m128d)__builtin_lasx_extract_128_lo_d((v4f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i + __lasx_extract_128_lo(__m256i _1) { + return (__m128i)__builtin_lasx_extract_128_lo((v4i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 + __lasx_extract_128_hi_s(__m256 _1) { + return (__m128)__builtin_lasx_extract_128_hi_s((v8f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d + __lasx_extract_128_hi_d(__m256d _1) { + return (__m128d)__builtin_lasx_extract_128_hi_d((v4f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i + __lasx_extract_128_hi(__m256i _1) { + return (__m128i)__builtin_lasx_extract_128_hi((v4i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_insert_128_lo_s(__m256 _1, __m128 _2) { + return (__m256)__builtin_lasx_insert_128_lo_s((v8f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_insert_128_lo_d(__m256d _1, __m128d 
_2) { + return (__m256d)__builtin_lasx_insert_128_lo_d((v4f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_insert_128_lo(__m256i _1, __m128i _2) { + return (__m256i)__builtin_lasx_insert_128_lo((v4i64)_1, (v2i64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_insert_128_hi_s(__m256 _1, __m128 _2) { + return (__m256)__builtin_lasx_insert_128_hi_s((v8f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_insert_128_hi_d(__m256d _1, __m128d _2) { + return (__m256d)__builtin_lasx_insert_128_hi_d((v4f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_insert_128_hi(__m256i _1, __m128i _2) { + return (__m256i)__builtin_lasx_insert_128_hi((v4i64)_1, (v2i64)_2); +} + +#endif /* defined(__loongarch_asx_sx_conv). */ #endif /* defined(__loongarch_asx). */ #endif /* _LOONGSON_ASXINTRIN_H. */ diff --git a/clang/lib/Headers/llvm_libc_wrappers/assert.h b/clang/lib/Headers/llvm_libc_wrappers/assert.h index 610ed96..7eadb2c 100644 --- a/clang/lib/Headers/llvm_libc_wrappers/assert.h +++ b/clang/lib/Headers/llvm_libc_wrappers/assert.h @@ -19,13 +19,11 @@ #if defined(__HIP__) || defined(__CUDA__) #define __LIBC_ATTRS __attribute__((device)) +#else +#define __LIBC_ATTRS #endif -#pragma omp begin declare target - -#include <llvm-libc-decls/assert.h> - -#pragma omp end declare target +// TODO: Define these for CUDA / HIP. #undef __LIBC_ATTRS diff --git a/clang/lib/Headers/llvm_libc_wrappers/ctype.h b/clang/lib/Headers/llvm_libc_wrappers/ctype.h index 960cf43..79b0c1e 100644 --- a/clang/lib/Headers/llvm_libc_wrappers/ctype.h +++ b/clang/lib/Headers/llvm_libc_wrappers/ctype.h @@ -13,128 +13,16 @@ #error "This file is for GPU offloading compilation only" #endif -// The GNU headers like to define 'toupper' and 'tolower' redundantly. This is -// necessary to prevent it from doing that and remapping our implementation. -#if (defined(__NVPTX__) || defined(__AMDGPU__)) && defined(__GLIBC__) -#pragma push_macro("__USE_EXTERN_INLINES") -#undef __USE_EXTERN_INLINES -#endif - #include_next <ctype.h> -#if (defined(__NVPTX__) || defined(__AMDGPU__)) && defined(__GLIBC__) -#pragma pop_macro("__USE_EXTERN_INLINES") -#endif - -#if __has_include(<llvm-libc-decls/ctype.h>) - #if defined(__HIP__) || defined(__CUDA__) #define __LIBC_ATTRS __attribute__((device)) +#else +#define __LIBC_ATTRS #endif -// The GNU headers like to provide these as macros, we need to undefine them so -// they do not conflict with the following definitions for the GPU. 
- -#pragma push_macro("isalnum") -#pragma push_macro("isalpha") -#pragma push_macro("isascii") -#pragma push_macro("isblank") -#pragma push_macro("iscntrl") -#pragma push_macro("isdigit") -#pragma push_macro("isgraph") -#pragma push_macro("islower") -#pragma push_macro("isprint") -#pragma push_macro("ispunct") -#pragma push_macro("isspace") -#pragma push_macro("isupper") -#pragma push_macro("isxdigit") -#pragma push_macro("toascii") -#pragma push_macro("tolower") -#pragma push_macro("toupper") -#pragma push_macro("isalnum_l") -#pragma push_macro("isalpha_l") -#pragma push_macro("isascii_l") -#pragma push_macro("isblank_l") -#pragma push_macro("iscntrl_l") -#pragma push_macro("isdigit_l") -#pragma push_macro("isgraph_l") -#pragma push_macro("islower_l") -#pragma push_macro("isprint_l") -#pragma push_macro("ispunct_l") -#pragma push_macro("isspace_l") -#pragma push_macro("isupper_l") -#pragma push_macro("isxdigit_l") - -#undef isalnum -#undef isalpha -#undef isascii -#undef iscntrl -#undef isdigit -#undef islower -#undef isgraph -#undef isprint -#undef ispunct -#undef isspace -#undef isupper -#undef isblank -#undef isxdigit -#undef toascii -#undef tolower -#undef toupper -#undef isalnum_l -#undef isalpha_l -#undef iscntrl_l -#undef isdigit_l -#undef islower_l -#undef isgraph_l -#undef isprint_l -#undef ispunct_l -#undef isspace_l -#undef isupper_l -#undef isblank_l -#undef isxdigit_l - -#pragma omp begin declare target - -#include <llvm-libc-decls/ctype.h> - -#pragma omp end declare target - -// Restore the original macros when compiling on the host. -#if !defined(__NVPTX__) && !defined(__AMDGPU__) -#pragma pop_macro("isalnum") -#pragma pop_macro("isalpha") -#pragma pop_macro("isascii") -#pragma pop_macro("isblank") -#pragma pop_macro("iscntrl") -#pragma pop_macro("isdigit") -#pragma pop_macro("isgraph") -#pragma pop_macro("islower") -#pragma pop_macro("isprint") -#pragma pop_macro("ispunct") -#pragma pop_macro("isspace") -#pragma pop_macro("isupper") -#pragma pop_macro("isxdigit") -#pragma pop_macro("toascii") -#pragma pop_macro("tolower") -#pragma pop_macro("toupper") -#pragma pop_macro("isalnum_l") -#pragma pop_macro("isalpha_l") -#pragma pop_macro("isascii_l") -#pragma pop_macro("isblank_l") -#pragma pop_macro("iscntrl_l") -#pragma pop_macro("isdigit_l") -#pragma pop_macro("isgraph_l") -#pragma pop_macro("islower_l") -#pragma pop_macro("isprint_l") -#pragma pop_macro("ispunct_l") -#pragma pop_macro("isspace_l") -#pragma pop_macro("isupper_l") -#pragma pop_macro("isxdigit_l") -#endif +// TODO: Define these for CUDA / HIP. #undef __LIBC_ATTRS -#endif - #endif // __CLANG_LLVM_LIBC_WRAPPERS_CTYPE_H__ diff --git a/clang/lib/Headers/llvm_libc_wrappers/inttypes.h b/clang/lib/Headers/llvm_libc_wrappers/inttypes.h index 415f1e4..2261389 100644 --- a/clang/lib/Headers/llvm_libc_wrappers/inttypes.h +++ b/clang/lib/Headers/llvm_libc_wrappers/inttypes.h @@ -19,13 +19,11 @@ #if defined(__HIP__) || defined(__CUDA__) #define __LIBC_ATTRS __attribute__((device)) +#else +#define __LIBC_ATTRS #endif -#pragma omp begin declare target - -#include <llvm-libc-decls/inttypes.h> - -#pragma omp end declare target +// TODO: Define these for CUDA / HIP. 
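+//
+// Illustrative sketch (an assumption, not part of this change): when these
+// declarations are restored, each would carry __LIBC_ATTRS so it is emitted
+// as a device function under CUDA / HIP, e.g.
+//   __LIBC_ATTRS intmax_t imaxabs(intmax_t);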
#undef __LIBC_ATTRS diff --git a/clang/lib/Headers/llvm_libc_wrappers/llvm-libc-decls/README.txt b/clang/lib/Headers/llvm_libc_wrappers/llvm-libc-decls/README.txt deleted file mode 100644 index e012cd9..0000000 --- a/clang/lib/Headers/llvm_libc_wrappers/llvm-libc-decls/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -LLVM libc declarations -====================== - -This directory will be filled by the `libc` project with declarations that are -availible on the device. Each declaration will use the `__LIBC_ATTRS` attribute -to control emission on the device side. diff --git a/clang/lib/Headers/llvm_libc_wrappers/stdio.h b/clang/lib/Headers/llvm_libc_wrappers/stdio.h index 950f91b..0c3e448 100644 --- a/clang/lib/Headers/llvm_libc_wrappers/stdio.h +++ b/clang/lib/Headers/llvm_libc_wrappers/stdio.h @@ -6,45 +6,19 @@ // //===----------------------------------------------------------------------===// +#ifndef __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__ +#define __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__ + #if !defined(_OPENMP) && !defined(__HIP__) && !defined(__CUDA__) #error "This file is for GPU offloading compilation only" #endif #include_next <stdio.h> -// In some old versions of glibc, other standard headers sometimes define -// special macros (e.g., __need_FILE) before including stdio.h to cause stdio.h -// to produce special definitions. Future includes of stdio.h when those -// special macros are undefined are expected to produce the normal definitions -// from stdio.h. -// -// We do not apply our include guard (__CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__) -// unconditionally to the above include_next. Otherwise, after an occurrence of -// the first glibc stdio.h use case described above, the include_next would be -// skipped for remaining includes of stdio.h, leaving required symbols -// undefined. -// -// We make the following assumptions to handle all use cases: -// -// 1. If the above include_next produces special glibc definitions, then (a) it -// does not produce the normal definitions that we must intercept below, (b) -// the current file was included from a glibc header that already defined -// __GLIBC__ (usually by including glibc's <features.h>), and (c) the above -// include_next does not define _STDIO_H. In that case, we skip the rest of -// the current file and don't guard against future includes. -// 2. If the above include_next produces the normal stdio.h definitions, then -// either (a) __GLIBC__ is not defined because C headers are from some other -// libc implementation or (b) the above include_next defines _STDIO_H to -// prevent the above include_next from having any effect in the future. -#if !defined(__GLIBC__) || defined(_STDIO_H) - -#ifndef __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__ -#define __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__ - -#if __has_include(<llvm-libc-decls/stdio.h>) - #if defined(__HIP__) || defined(__CUDA__) #define __LIBC_ATTRS __attribute__((device)) +#else +#define __LIBC_ATTRS #endif // Some headers provide these as macros. Temporarily undefine them so they do @@ -60,21 +34,19 @@ #pragma omp begin declare target -#include <llvm-libc-decls/stdio.h> +__LIBC_ATTRS extern FILE *stderr; +__LIBC_ATTRS extern FILE *stdin; +__LIBC_ATTRS extern FILE *stdout; #pragma omp end declare target -#undef __LIBC_ATTRS - // Restore the original macros when compiling on the host. 
#if !defined(__NVPTX__) && !defined(__AMDGPU__) -#pragma pop_macro("stdout") #pragma pop_macro("stderr") #pragma pop_macro("stdin") +#pragma pop_macro("stdout") #endif -#endif +#undef __LIBC_ATTRS #endif // __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__ - -#endif diff --git a/clang/lib/Headers/llvm_libc_wrappers/stdlib.h b/clang/lib/Headers/llvm_libc_wrappers/stdlib.h index 1da22abd0..7af5e2e 100644 --- a/clang/lib/Headers/llvm_libc_wrappers/stdlib.h +++ b/clang/lib/Headers/llvm_libc_wrappers/stdlib.h @@ -15,39 +15,18 @@ #include_next <stdlib.h> -#if __has_include(<llvm-libc-decls/stdlib.h>) - #if defined(__HIP__) || defined(__CUDA__) #define __LIBC_ATTRS __attribute__((device)) +#else +#define __LIBC_ATTRS #endif #pragma omp begin declare target -// The LLVM C library uses these named types so we forward declare them. -typedef void (*__atexithandler_t)(void); -typedef int (*__search_compare_t)(const void *, const void *); -typedef int (*__qsortcompare_t)(const void *, const void *); -typedef int (*__qsortrcompare_t)(const void *, const void *, void *); - -// Enforce ABI compatibility with the structs used by the LLVM C library. -_Static_assert(__builtin_offsetof(div_t, quot) == 0, "ABI mismatch!"); -_Static_assert(__builtin_offsetof(ldiv_t, quot) == 0, "ABI mismatch!"); -_Static_assert(__builtin_offsetof(lldiv_t, quot) == 0, "ABI mismatch!"); - -#if defined(__GLIBC__) && __cplusplus >= 201703L -#define at_quick_exit atexit -#endif - -#include <llvm-libc-decls/stdlib.h> - -#if defined(__GLIBC__) && __cplusplus >= 201703L -#undef at_quick_exit -#endif +// TODO: Define these for CUDA / HIP. #pragma omp end declare target #undef __LIBC_ATTRS -#endif - #endif // __CLANG_LLVM_LIBC_WRAPPERS_STDLIB_H__ diff --git a/clang/lib/Headers/llvm_libc_wrappers/string.h b/clang/lib/Headers/llvm_libc_wrappers/string.h index 0ea49cb1..766a58f 100644 --- a/clang/lib/Headers/llvm_libc_wrappers/string.h +++ b/clang/lib/Headers/llvm_libc_wrappers/string.h @@ -15,82 +15,14 @@ #include_next <string.h> -#if __has_include(<llvm-libc-decls/string.h>) - #if defined(__HIP__) || defined(__CUDA__) #define __LIBC_ATTRS __attribute__((device)) -#endif - -#pragma omp begin declare target - -// The GNU headers provide C++ standard compliant headers when in C++ mode and -// the LLVM libc does not. We need to manually provide the definitions using the -// same prototypes. 
-#if defined(__cplusplus) && defined(__GLIBC__) && \ - defined(__CORRECT_ISO_CPP_STRING_H_PROTO) - -#ifndef __LIBC_ATTRS -#define __LIBC_ATTRS -#endif - -extern "C" { -void *memccpy(void *__restrict, const void *__restrict, int, - size_t) __LIBC_ATTRS; -int memcmp(const void *, const void *, size_t) __LIBC_ATTRS; -void *memcpy(void *__restrict, const void *__restrict, size_t) __LIBC_ATTRS; -void *memmem(const void *, size_t, const void *, size_t) __LIBC_ATTRS; -void *memmove(void *, const void *, size_t) __LIBC_ATTRS; -void *mempcpy(void *__restrict, const void *__restrict, size_t) __LIBC_ATTRS; -void *memset(void *, int, size_t) __LIBC_ATTRS; -char *stpcpy(char *__restrict, const char *__restrict) __LIBC_ATTRS; -char *stpncpy(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS; -char *strcat(char *__restrict, const char *__restrict) __LIBC_ATTRS; -int strcmp(const char *, const char *) __LIBC_ATTRS; -int strcoll(const char *, const char *) __LIBC_ATTRS; -char *strcpy(char *__restrict, const char *__restrict) __LIBC_ATTRS; -size_t strcspn(const char *, const char *) __LIBC_ATTRS; -char *strdup(const char *) __LIBC_ATTRS; -size_t strlen(const char *) __LIBC_ATTRS; -char *strncat(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS; -int strncmp(const char *, const char *, size_t) __LIBC_ATTRS; -char *strncpy(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS; -char *strndup(const char *, size_t) __LIBC_ATTRS; -size_t strnlen(const char *, size_t) __LIBC_ATTRS; -size_t strspn(const char *, const char *) __LIBC_ATTRS; -char *strtok(char *__restrict, const char *__restrict) __LIBC_ATTRS; -char *strtok_r(char *__restrict, const char *__restrict, - char **__restrict) __LIBC_ATTRS; -size_t strxfrm(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS; -} - -extern "C++" { -char *strstr(char *, const char *) noexcept __LIBC_ATTRS; -const char *strstr(const char *, const char *) noexcept __LIBC_ATTRS; -char *strpbrk(char *, const char *) noexcept __LIBC_ATTRS; -const char *strpbrk(const char *, const char *) noexcept __LIBC_ATTRS; -char *strrchr(char *, int) noexcept __LIBC_ATTRS; -const char *strrchr(const char *, int) noexcept __LIBC_ATTRS; -char *strchr(char *, int) noexcept __LIBC_ATTRS; -const char *strchr(const char *, int) noexcept __LIBC_ATTRS; -char *strchrnul(char *, int) noexcept __LIBC_ATTRS; -const char *strchrnul(const char *, int) noexcept __LIBC_ATTRS; -char *strcasestr(char *, const char *) noexcept __LIBC_ATTRS; -const char *strcasestr(const char *, const char *) noexcept __LIBC_ATTRS; -void *memrchr(void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS; -const void *memrchr(const void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS; -void *memchr(void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS; -const void *memchr(const void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS; -} - #else -#include <llvm-libc-decls/string.h> - +#define __LIBC_ATTRS #endif -#pragma omp end declare target +// TODO: Define these for CUDA / HIP. 
#undef __LIBC_ATTRS -#endif - #endif // __CLANG_LLVM_LIBC_WRAPPERS_STRING_H__ diff --git a/clang/lib/Headers/llvm_libc_wrappers/time.h b/clang/lib/Headers/llvm_libc_wrappers/time.h index 9d1340c..d38eea3 100644 --- a/clang/lib/Headers/llvm_libc_wrappers/time.h +++ b/clang/lib/Headers/llvm_libc_wrappers/time.h @@ -15,20 +15,14 @@ #include_next <time.h> -#if __has_include(<llvm-libc-decls/time.h>) - #if defined(__HIP__) || defined(__CUDA__) #define __LIBC_ATTRS __attribute__((device)) +#else +#define __LIBC_ATTRS #endif -#pragma omp begin declare target - -_Static_assert(sizeof(clock_t) == sizeof(long), "ABI mismatch!"); - -#include <llvm-libc-decls/time.h> +// TODO: Define these for CUDA / HIP. -#pragma omp end declare target - -#endif +#undef __LIBC_ATTRS #endif // __CLANG_LLVM_LIBC_WRAPPERS_TIME_H__ diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index aca78e6..2cf4645 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -39,14 +39,14 @@ typedef short __v8hi __attribute__((__vector_size__(16))); typedef char __v16qi __attribute__((__vector_size__(16))); /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS_SSE2 \ __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ - __min_vector_width__(128))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr + __min_vector_width__(128))) constexpr #else -#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 +#define __DEFAULT_FN_ATTRS_SSE2 \ + __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ + __min_vector_width__(128))) #endif #define __trunc64(x) \ @@ -54,9 +54,6 @@ typedef char __v16qi __attribute__((__vector_size__(16))); #define __zext128(x) \ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, 2, 3) -#define __anyext128(x) \ - (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ - 1, -1, -1) /// Clears the MMX state by setting the state of the x87 stack registers /// to empty. @@ -82,10 +79,8 @@ static __inline__ void /// A 32-bit integer value. /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the /// parameter. The upper 32 bits are set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cvtsi32_si64(int __i) -{ - return __extension__ (__m64)(__v2si){__i, 0}; +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi32_si64(int __i) { + return __extension__(__m64)(__v2si){__i, 0}; } /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit @@ -99,10 +94,8 @@ _mm_cvtsi32_si64(int __i) /// A 64-bit integer vector. /// \returns A 32-bit signed integer value containing the lower 32 bits of the /// parameter. -static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cvtsi64_si32(__m64 __m) -{ - return ((__v2si)__m)[0]; +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_si32(__m64 __m) { + return ((__v2si)__m)[0]; } /// Casts a 64-bit signed integer value into a 64-bit integer vector. @@ -115,10 +108,8 @@ _mm_cvtsi64_si32(__m64 __m) /// A 64-bit signed integer. /// \returns A 64-bit integer vector containing the same bitwise pattern as the /// parameter. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cvtsi64_m64(long long __i) -{ - return __extension__ (__m64)(__v1di){__i}; +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_m64(long long __i) { + return __extension__(__m64)(__v1di){__i}; } /// Casts a 64-bit integer vector into a 64-bit signed integer value. @@ -131,10 +122,8 @@ _mm_cvtsi64_m64(long long __i) /// A 64-bit integer vector. /// \returns A 64-bit signed integer containing the same bitwise pattern as the /// parameter. -static __inline__ long long __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cvtm64_si64(__m64 __m) -{ - return ((__v1di)__m)[0]; +static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 _mm_cvtm64_si64(__m64 __m) { + return ((__v1di)__m)[0]; } /// Converts, with saturation, 16-bit signed integers from both 64-bit integer @@ -156,8 +145,8 @@ _mm_cvtm64_si64(__m64 __m) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_packs_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, + __m64 __m2) { return __trunc64(__builtin_ia32_packsswb128( (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } @@ -181,8 +170,8 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) { /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_packs_pi32(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi32(__m64 __m1, + __m64 __m2) { return __trunc64(__builtin_ia32_packssdw128( (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){})); } @@ -206,8 +195,8 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) { /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_packs_pu16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pu16(__m64 __m1, + __m64 __m2) { return __trunc64(__builtin_ia32_packuswb128( (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } @@ -233,8 +222,8 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2) { /// Bits [63:56] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 12, 5, 13, 6, 14, 7, 15); } @@ -256,8 +245,8 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { /// Bits [63:48] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 6, 3, 7); } @@ -276,8 +265,8 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi32(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3); } @@ -302,8 +291,8 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { /// Bits [31:24] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8, 1, 9, 2, 10, 3, 11); } @@ -325,8 +314,8 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { /// Bits [31:16] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4, 1, 5); } @@ -345,8 +334,8 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi32(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2); } @@ -365,10 +354,9 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_add_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi8(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); } /// Adds each 16-bit integer element of the first 64-bit integer vector @@ -386,10 +374,9 @@ _mm_add_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_add_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi16(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); } /// Adds each 32-bit integer element of the first 64-bit integer vector @@ -407,10 +394,9 @@ _mm_add_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_add_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi32(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); } /// Adds, with saturation, each 8-bit signed integer element of the first @@ -431,8 +417,8 @@ _mm_add_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. 
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_adds_pi8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2); } @@ -454,8 +440,8 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_adds_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2); } @@ -476,8 +462,8 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_adds_pu8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2); } @@ -498,8 +484,8 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_adds_pu16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2); } @@ -518,10 +504,9 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_sub_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi8(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); } /// Subtracts each 16-bit integer element of the second 64-bit integer @@ -539,10 +524,9 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_sub_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi16(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); } /// Subtracts each 32-bit integer element of the second 64-bit integer @@ -560,10 +544,9 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32] containing the subtrahends. /// \returns A 64-bit integer vector of [2 x i32] containing the differences of /// both parameters. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_sub_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi32(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); } /// Subtracts, with saturation, each 8-bit signed integer element of the second @@ -584,8 +567,8 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_subs_pi8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2); } @@ -607,8 +590,8 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_subs_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2); } @@ -630,8 +613,8 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_subs_pu8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2); } @@ -653,8 +636,8 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_subs_pu16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2); } @@ -679,8 +662,8 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of /// products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_madd_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_madd_pi16(__m64 __m1, + __m64 __m2) { return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1), (__v8hi)__zext128(__m2))); } @@ -700,11 +683,10 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits /// of the products of both parameters. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_mulhi_pi16(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__zext128(__m1), - (__v8hi)__zext128(__m2))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pi16(__m64 __m1, + __m64 __m2) { + return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__zext128(__m1), + (__v8hi)__zext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -722,10 +704,9 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_mullo_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mullo_pi16(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); } /// Left-shifts each 16-bit signed integer element of the first @@ -748,8 +729,8 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2) static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi16(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m), - (__v8hi)__anyext128(__count))); + return __trunc64(__builtin_ia32_psllw128((__v8hi)__zext128(__m), + (__v8hi)__zext128(__count))); } /// Left-shifts each 16-bit signed integer element of a 64-bit integer @@ -768,8 +749,8 @@ _mm_sll_pi16(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_slli_pi16(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi16(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psllwi128((__v8hi)__zext128(__m), __count)); } @@ -793,8 +774,8 @@ _mm_slli_pi16(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi32(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m), - (__v4si)__anyext128(__count))); + return __trunc64(__builtin_ia32_pslld128((__v4si)__zext128(__m), + (__v4si)__zext128(__count))); } /// Left-shifts each 32-bit signed integer element of a 64-bit integer @@ -813,8 +794,8 @@ _mm_sll_pi32(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_slli_pi32(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi32(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_pslldi128((__v4si)__zext128(__m), __count)); } @@ -835,8 +816,8 @@ _mm_slli_pi32(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_si64(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m), - (__v2di)__anyext128(__count))); + return __trunc64(__builtin_ia32_psllq128((__v2di)__zext128(__m), + (__v2di)__zext128(__count))); } /// Left-shifts the first parameter, which is a 64-bit integer, by the @@ -853,8 +834,8 @@ _mm_sll_si64(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_slli_si64(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psllqi128((__v2di)__zext128(__m), __count)); } @@ -879,8 +860,8 @@ _mm_slli_si64(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi16(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m), - (__v8hi)__anyext128(__count))); + return __trunc64(__builtin_ia32_psraw128((__v8hi)__zext128(__m), + (__v8hi)__zext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -900,8 +881,8 @@ _mm_sra_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_srai_pi16(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi16(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psrawi128((__v8hi)__zext128(__m), __count)); } @@ -926,8 +907,8 @@ _mm_srai_pi16(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi32(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m), - (__v4si)__anyext128(__count))); + return __trunc64(__builtin_ia32_psrad128((__v4si)__zext128(__m), + (__v4si)__zext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -947,8 +928,8 @@ _mm_sra_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_srai_pi32(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi32(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psradi128((__v4si)__zext128(__m), __count)); } @@ -972,8 +953,8 @@ _mm_srai_pi32(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi16(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m), - (__v8hi)__anyext128(__count))); + return __trunc64(__builtin_ia32_psrlw128((__v8hi)__zext128(__m), + (__v8hi)__zext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -992,8 +973,8 @@ _mm_srl_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_srli_pi16(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi16(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__zext128(__m), __count)); } @@ -1017,8 +998,8 @@ _mm_srli_pi16(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi32(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m), - (__v4si)__anyext128(__count))); + return __trunc64(__builtin_ia32_psrld128((__v4si)__zext128(__m), + (__v4si)__zext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -1037,8 +1018,8 @@ _mm_srl_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. 
@@ -1037,8 +1018,8 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 ///    values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_srli_pi32(__m64 __m, int __count) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi32(__m64 __m,
+                                                              int __count) {
   return __trunc64(__builtin_ia32_psrldi128((__v4si)__zext128(__m), __count));
 }
@@ -1059,8 +1040,8 @@ _mm_srli_pi32(__m64 __m, int __count) {
 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_srl_si64(__m64 __m, __m64 __count)
 {
-  return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
-                                           (__v2di)__anyext128(__count)));
+  return __trunc64(__builtin_ia32_psrlq128((__v2di)__zext128(__m),
+                                           (__v2di)__zext128(__count)));
 }
 
 /// Right-shifts the first parameter, which is a 64-bit integer, by the
@@ -1078,8 +1059,8 @@ _mm_srl_si64(__m64 __m, __m64 __count)
 /// \param __count
 ///    A 32-bit integer value.
 /// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_srli_si64(__m64 __m, int __count) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m,
+                                                              int __count) {
   return __trunc64(__builtin_ia32_psrlqi128((__v2di)__zext128(__m), __count));
 }
@@ -1095,10 +1076,9 @@ _mm_srli_si64(__m64 __m, int __count) {
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_and_si64(__m64 __m1, __m64 __m2)
-{
-  return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_and_si64(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
@@ -1116,10 +1096,9 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise AND of the second
 ///    parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_andnot_si64(__m64 __m1, __m64 __m2)
-{
-  return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_andnot_si64(__m64 __m1,
+                                                                __m64 __m2) {
+  return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
 }
 
 /// Performs a bitwise OR of two 64-bit integer vectors.
@@ -1134,10 +1113,9 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_or_si64(__m64 __m1, __m64 __m2)
-{
-  return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_or_si64(__m64 __m1,
+                                                            __m64 __m2) {
+  return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
 }
 
 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@@ -1152,10 +1130,9 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector.
 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
 ///    parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_xor_si64(__m64 __m1, __m64 __m2)
-{
-  return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_xor_si64(__m64 __m1,
+                                                             __m64 __m2) {
+  return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
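
Because _mm_andnot_si64 computes the complement of its first argument ANDed with its second, the three logical intrinsics above compose into the usual branch-free select idiom. A sketch; the helper name is illustrative:

  #include <mmintrin.h>

  /* Select bits of a where mask is set, bits of b where it is clear. */
  static __m64 select_bits(__m64 mask, __m64 a, __m64 b) {
    return _mm_or_si64(_mm_and_si64(mask, a), _mm_andnot_si64(mask, b));
  }
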
@@ -1174,10 +1151,9 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
-{
-  return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi8(__m64 __m1,
+                                                              __m64 __m2) {
+  return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
 }
 
 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1196,10 +1172,9 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
-{
-  return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi16(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1218,10 +1193,9 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
-{
-  return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi32(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
 }
 
 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1240,9 +1214,8 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [8 x i8].
 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
-{
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi8(__m64 __m1,
+                                                              __m64 __m2) {
   /* This function always performs a signed comparison, but __v8qi is a char
      which may be signed or unsigned, so use __v8qs. */
   return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
@@ -1264,10 +1237,9 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [4 x i16].
 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
-{
-  return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi16(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
 }
 
 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1286,10 +1258,9 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 ///    A 64-bit integer vector of [2 x i32].
 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
 ///    results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
-{
-  return (__m64)((__v2si)__m1 > (__v2si)__m2);
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi32(__m64 __m1,
+                                                               __m64 __m2) {
+  return (__m64)((__v2si)__m1 > (__v2si)__m2);
 }
 
 /// Constructs a 64-bit integer vector initialized to zero.
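
The comparison intrinsics return all-ones lanes where the predicate holds, which combines naturally with the select idiom shown earlier. A hedged sketch of a per-lane signed maximum (not an intrinsic from this header):

  #include <mmintrin.h>

  /* Per-lane max of signed 16-bit elements; illustrative composition. */
  static __m64 max_pi16(__m64 a, __m64 b) {
    __m64 gt = _mm_cmpgt_pi16(a, b); /* 0xFFFF where a > b, else 0 */
    return _mm_or_si64(_mm_and_si64(gt, a), _mm_andnot_si64(gt, b));
  }
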
@@ -1299,8 +1270,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_setzero_si64(void) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void) {
   return __extension__(__m64){0LL};
 }
@@ -1319,8 +1289,8 @@ _mm_setzero_si64(void) {
 ///    A 32-bit integer value used to initialize the lower 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_set_pi32(int __i1, int __i0) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi32(int __i1,
+                                                             int __i0) {
   return __extension__(__m64)(__v2si){__i0, __i1};
 }
@@ -1341,8 +1311,10 @@ _mm_set_pi32(int __i1, int __i0) {
 /// \param __s0
 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi16(short __s3,
+                                                             short __s2,
+                                                             short __s1,
+                                                             short __s0) {
   return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3};
 }
@@ -1371,7 +1343,7 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) {
 /// \param __b0
 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
             char __b1, char __b0) {
   return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3,
@@ -1391,8 +1363,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
 ///    A 32-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_set1_pi32(int __i) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi32(int __i) {
   return _mm_set_pi32(__i, __i);
 }
@@ -1409,8 +1380,7 @@ _mm_set1_pi32(int __i) {
 ///    A 16-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_set1_pi16(short __w) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi16(short __w) {
   return _mm_set_pi16(__w, __w, __w, __w);
 }
@@ -1426,8 +1396,7 @@ _mm_set1_pi16(short __w) {
 ///    An 8-bit integer value used to initialize each vector element of the
 ///    result.
 /// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_set1_pi8(char __b) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi8(char __b) {
   return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
 }
@@ -1446,8 +1415,8 @@ _mm_set1_pi8(char __b) {
 ///    A 32-bit integer value used to initialize the upper 32 bits of the
 ///    result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_setr_pi32(int __i0, int __i1) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi32(int __i0,
+                                                              int __i1) {
   return _mm_set_pi32(__i1, __i0);
 }
@@ -1468,8 +1437,10 @@ _mm_setr_pi32(int __i0, int __i1) {
 /// \param __w3
 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
-_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi16(short __w0,
+                                                              short __w1,
+                                                              short __w2,
+                                                              short __w3) {
   return _mm_set_pi16(__w3, __w2, __w1, __w0);
 }
@@ -1498,13 +1469,12 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
 /// \param __b7
 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
 /// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7) {
   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
 }
 
-#undef __anyext128
 #undef __trunc64
 #undef __DEFAULT_FN_ATTRS_SSE2
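
As the implementations above show, the _mm_setr_* forms are argument-reversed calls to the _mm_set_* forms: _mm_set_* takes elements from the highest lane down, _mm_setr_* from bits [15:0] upward. Illustrative snippet:

  #include <mmintrin.h>

  static void same_vector(void) {
    __m64 v1 = _mm_set_pi16(4, 3, 2, 1);  /* high lane first; bits [15:0] = 1 */
    __m64 v2 = _mm_setr_pi16(1, 2, 3, 4); /* low lane first; bits [15:0] = 1 */
    (void)v1; (void)v2;                   /* v1 and v2 hold the same value */
  }
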
diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
index 2e4d533..c13dd3f 100644
--- a/clang/lib/Headers/module.modulemap
+++ b/clang/lib/Headers/module.modulemap
@@ -253,6 +253,11 @@ module _Builtin_stdbool [system] {
   export *
 }
 
+module _Builtin_stdckdint [system] {
+  header "stdckdint.h"
+  export *
+}
+
 module _Builtin_stdcountof [system] {
   header "stdcountof.h"
   export *
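
With stdckdint.h now exposed as its own builtin module, C23 checked arithmetic is importable like the other _Builtin_* headers. A minimal usage sketch of the standard ckd_add macro:

  #include <stdckdint.h>
  #include <stdio.h>

  int main(void) {
    int sum;
    /* ckd_add returns true on overflow; otherwise *result holds a + b. */
    if (ckd_add(&sum, 2147483647, 1))
      puts("overflow detected");
    else
      printf("%d\n", sum);
    return 0;
  }
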
diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h
index 414f10a..898026c 100644
--- a/clang/lib/Headers/opencl-c-base.h
+++ b/clang/lib/Headers/opencl-c-base.h
@@ -9,105 +9,6 @@
 #ifndef _OPENCL_BASE_H_
 #define _OPENCL_BASE_H_
 
-// Define extension macros
-
-#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
-// For SPIR and SPIR-V all extensions are supported.
-#if defined(__SPIR__) || defined(__SPIRV__)
-#define cl_khr_subgroup_extended_types 1
-#define cl_khr_subgroup_non_uniform_vote 1
-#define cl_khr_subgroup_ballot 1
-#define cl_khr_subgroup_non_uniform_arithmetic 1
-#define cl_khr_subgroup_shuffle 1
-#define cl_khr_subgroup_shuffle_relative 1
-#define cl_khr_subgroup_clustered_reduce 1
-#define cl_khr_subgroup_rotate 1
-#define cl_khr_extended_bit_ops 1
-#define cl_khr_integer_dot_product 1
-#define __opencl_c_integer_dot_product_input_4x8bit 1
-#define __opencl_c_integer_dot_product_input_4x8bit_packed 1
-#define cl_ext_float_atomics 1
-#ifdef cl_khr_fp16
-#define __opencl_c_ext_fp16_global_atomic_load_store 1
-#define __opencl_c_ext_fp16_local_atomic_load_store 1
-#define __opencl_c_ext_fp16_global_atomic_add 1
-#define __opencl_c_ext_fp16_local_atomic_add 1
-#define __opencl_c_ext_fp16_global_atomic_min_max 1
-#define __opencl_c_ext_fp16_local_atomic_min_max 1
-#endif
-#ifdef cl_khr_fp64
-#define __opencl_c_ext_fp64_global_atomic_add 1
-#define __opencl_c_ext_fp64_local_atomic_add 1
-#define __opencl_c_ext_fp64_global_atomic_min_max 1
-#define __opencl_c_ext_fp64_local_atomic_min_max 1
-#endif
-#define __opencl_c_ext_fp32_global_atomic_add 1
-#define __opencl_c_ext_fp32_local_atomic_add 1
-#define __opencl_c_ext_fp32_global_atomic_min_max 1
-#define __opencl_c_ext_fp32_local_atomic_min_max 1
-#define __opencl_c_ext_image_raw10_raw12 1
-#define __opencl_c_ext_image_unorm_int_2_101010 1
-#define __opencl_c_ext_image_unsigned_10x6_12x4_14x2 1
-#define cl_khr_kernel_clock 1
-#define __opencl_c_kernel_clock_scope_device 1
-#define __opencl_c_kernel_clock_scope_work_group 1
-#define __opencl_c_kernel_clock_scope_sub_group 1
-
-#endif // defined(__SPIR__) || defined(__SPIRV__)
-#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
-
-// Define feature macros for OpenCL C 2.0
-#if (__OPENCL_CPP_VERSION__ == 100 || __OPENCL_C_VERSION__ == 200)
-#define __opencl_c_pipes 1
-#define __opencl_c_generic_address_space 1
-#define __opencl_c_work_group_collective_functions 1
-#define __opencl_c_atomic_order_acq_rel 1
-#define __opencl_c_atomic_order_seq_cst 1
-#define __opencl_c_atomic_scope_device 1
-#define __opencl_c_atomic_scope_all_devices 1
-#define __opencl_c_device_enqueue 1
-#define __opencl_c_read_write_images 1
-#define __opencl_c_program_scope_global_variables 1
-#define __opencl_c_images 1
-#endif
-
-// Define header-only feature macros for OpenCL C 3.0.
-#if (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
-// For the SPIR and SPIR-V target all features are supported.
-#if defined(__SPIR__) || defined(__SPIRV__)
-#define __opencl_c_work_group_collective_functions 1
-#define __opencl_c_atomic_order_seq_cst 1
-#define __opencl_c_atomic_scope_device 1
-#define __opencl_c_atomic_scope_all_devices 1
-#define __opencl_c_read_write_images 1
-#endif // defined(__SPIR__)
-
-#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
-
-// Undefine any feature macros that have been explicitly disabled using
-// an __undef_<feature> macro.
-#ifdef __undef___opencl_c_work_group_collective_functions
-#undef __opencl_c_work_group_collective_functions
-#endif
-#ifdef __undef___opencl_c_atomic_order_seq_cst
-#undef __opencl_c_atomic_order_seq_cst
-#endif
-#ifdef __undef___opencl_c_atomic_scope_device
-#undef __opencl_c_atomic_scope_device
-#endif
-#ifdef __undef___opencl_c_atomic_scope_all_devices
-#undef __opencl_c_atomic_scope_all_devices
-#endif
-#ifdef __undef___opencl_c_read_write_images
-#undef __opencl_c_read_write_images
-#endif
-#ifdef __undef___opencl_c_integer_dot_product_input_4x8bit
-#undef __opencl_c_integer_dot_product_input_4x8bit
-#endif
-#ifdef __undef___opencl_c_integer_dot_product_input_4x8bit_packed
-#undef __opencl_c_integer_dot_product_input_4x8bit_packed
-#endif
-
 #if !defined(__opencl_c_generic_address_space)
 // Internal feature macro to provide named (global, local, private) address
 // space overloads for builtin functions that take a pointer argument.
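
With the header no longer hard-coding these extension and feature macros, kernel code should test for them rather than assume the SPIR/SPIR-V defaults. A hedged OpenCL C sketch; the kernel body is illustrative:

  /* Guard use of read-write images on the feature macro instead of
     assuming the header defined it. */
  #ifdef __opencl_c_read_write_images
  kernel void touch(read_write image2d_t img) { /* ... */ }
  #endif
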
diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h
index f65b4b3..f9cdb2e 100644
--- a/clang/lib/Headers/opencl-c.h
+++ b/clang/lib/Headers/opencl-c.h
@@ -17525,6 +17525,13 @@ void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint
 #endif // defined(cl_intel_subgroups_char) || defined(cl_intel_subgroups_short) ||
        // defined(cl_intel_subgroups_long)
+
+#if defined(cl_intel_subgroup_buffer_prefetch)
+void __ovld __conv intel_sub_group_block_prefetch_ui(const __global uint *p);
+void __ovld __conv intel_sub_group_block_prefetch_ui2(const __global uint *p);
+void __ovld __conv intel_sub_group_block_prefetch_ui4(const __global uint *p);
+void __ovld __conv intel_sub_group_block_prefetch_ui8(const __global uint *p);
+#endif // defined(cl_intel_subgroup_buffer_prefetch)
 #endif // cl_intel_subgroups
 
 #if defined(cl_intel_subgroups_short)
@@ -17660,6 +17667,14 @@ void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, u
 void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data );
 void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data );
 void __ovld __conv intel_sub_group_block_write_us16( __global ushort* p, ushort16 data );
+
+#if defined(cl_intel_subgroup_buffer_prefetch)
+void __ovld __conv intel_sub_group_block_prefetch_us(const __global ushort *p);
+void __ovld __conv intel_sub_group_block_prefetch_us2(const __global ushort *p);
+void __ovld __conv intel_sub_group_block_prefetch_us4(const __global ushort *p);
+void __ovld __conv intel_sub_group_block_prefetch_us8(const __global ushort *p);
+void __ovld __conv intel_sub_group_block_prefetch_us16(const __global ushort *p);
+#endif // defined(cl_intel_subgroup_buffer_prefetch)
 #endif // cl_intel_subgroups_short
 
 #if defined(cl_intel_subgroups_char)
@@ -17795,6 +17810,14 @@ void __ovld __conv intel_sub_group_block_write_uc2( __global uchar* p, uc
 void __ovld __conv intel_sub_group_block_write_uc4( __global uchar* p, uchar4 data );
 void __ovld __conv intel_sub_group_block_write_uc8( __global uchar* p, uchar8 data );
 void __ovld __conv intel_sub_group_block_write_uc16( __global uchar* p, uchar16 data );
+
+#if defined(cl_intel_subgroup_buffer_prefetch)
+void __ovld __conv intel_sub_group_block_prefetch_uc(const __global uchar *p);
+void __ovld __conv intel_sub_group_block_prefetch_uc2(const __global uchar *p);
+void __ovld __conv intel_sub_group_block_prefetch_uc4(const __global uchar *p);
+void __ovld __conv intel_sub_group_block_prefetch_uc8(const __global uchar *p);
+void __ovld __conv intel_sub_group_block_prefetch_uc16(const __global uchar *p);
+#endif // defined(cl_intel_subgroup_buffer_prefetch)
 #endif // cl_intel_subgroups_char
 
 #if defined(cl_intel_subgroups_long)
@@ -17839,6 +17862,13 @@ void __ovld __conv intel_sub_group_block_write_ul( __global ulong* p, ul
 void __ovld __conv intel_sub_group_block_write_ul2( __global ulong* p, ulong2 data );
 void __ovld __conv intel_sub_group_block_write_ul4( __global ulong* p, ulong4 data );
 void __ovld __conv intel_sub_group_block_write_ul8( __global ulong* p, ulong8 data);
+
+#if defined(cl_intel_subgroup_buffer_prefetch)
+void __ovld __conv intel_sub_group_block_prefetch_ul(const __global ulong *p);
+void __ovld __conv intel_sub_group_block_prefetch_ul2(const __global ulong *p);
+void __ovld __conv intel_sub_group_block_prefetch_ul4(const __global ulong *p);
+void __ovld __conv intel_sub_group_block_prefetch_ul8(const __global ulong *p);
+#endif // defined(cl_intel_subgroup_buffer_prefetch)
 #endif // cl_intel_subgroups_long
 
 #if defined(cl_intel_subgroup_local_block_io)
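
The new prefetch declarations mirror the existing block reads and writes, one overload per element count. A sketch of how a kernel might issue the hint; the kernel body, pointer arithmetic, and placement are illustrative assumptions, not part of the extension's specification:

  #if defined(cl_intel_subgroup_buffer_prefetch)
  kernel void scale(global uint *src, global uint *dst) {
    size_t i = get_global_id(0);
    /* Uniform hint for the block this work-group is about to read. */
    intel_sub_group_block_prefetch_ui4(src + get_group_id(0) * get_local_size(0));
    dst[i] = src[i] * 2u;
  }
  #endif
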
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index 42bd343..a9a6544 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -60,9 +60,8 @@ _mm_lddqu_si128(__m128i_u const *__p)
 ///    A 128-bit vector of [4 x float] containing the right source operand.
 /// \returns A 128-bit vector of [4 x float] containing the alternating sums and
 ///    differences of both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_addsub_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_addsub_ps(__m128 __a, __m128 __b) {
   return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
 }
 
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index fe6afdc..72a6439 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -231,10 +231,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
 ///    used in the calculation.
 /// \returns A 128-bit vector of [4 x float] containing the square root of the
 ///    value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
-  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+  __a[0] = __builtin_elementwise_sqrt(__a[0]);
+  return __a;
 }
 
 /// Calculates the square roots of the values stored in a 128-bit vector
@@ -248,10 +247,8 @@ _mm_sqrt_ss(__m128 __a)
 ///    A 128-bit vector of [4 x float].
 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
 ///    values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
-  return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the approximate reciprocal of the value stored in the
