aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkonglin1 <lingling.kong@intel.com>2022-10-31 14:04:08 +0800
committerkonglin1 <lingling.kong@intel.com>2022-10-31 14:04:12 +0800
commit87235f1e5c740de9c6f72a5dd7d7eb9cb7df2e1d (patch)
tree04704741c19b5506500c901c12f632e64107c37b
parent6913cad2a38bc406b137b06d579b650f6fe9a2e6 (diff)
downloadgcc-87235f1e5c740de9c6f72a5dd7d7eb9cb7df2e1d.zip
gcc-87235f1e5c740de9c6f72a5dd7d7eb9cb7df2e1d.tar.gz
gcc-87235f1e5c740de9c6f72a5dd7d7eb9cb7df2e1d.tar.bz2
i386: Use __bf16 for AVX512BF16 intrinsics
gcc/ChangeLog: * config/i386/avx512bf16intrin.h (__attribute__): Change short to bf16. (_mm_cvtsbh_ss): Ditto. (_mm512_cvtne2ps_pbh): Ditto. (_mm512_mask_cvtne2ps_pbh): Ditto. (_mm512_maskz_cvtne2ps_pbh): Ditto. * config/i386/avx512bf16vlintrin.h (__attribute__): Ditto. (_mm256_cvtne2ps_pbh): Ditto. (_mm256_mask_cvtne2ps_pbh): Ditto. (_mm256_maskz_cvtne2ps_pbh): Ditto. (_mm_cvtne2ps_pbh): Ditto. (_mm_mask_cvtne2ps_pbh): Ditto. (_mm_maskz_cvtne2ps_pbh): Ditto. (_mm_cvtness_sbh): Ditto. * config/i386/i386-builtin-types.def (V8BF): Add new DEF_VECTOR_TYPE for BFmode. (V16BF): Ditto. (V32BF): Ditto. * config/i386/i386-builtin.def (BDESC): Fix builtins. * config/i386/i386-expand.cc (ix86_expand_args_builtin): Change avx512bf16 ix86_builtin_func_type from HI to BF. * config/i386/immintrin.h: Add SSE2 dependency for avx512bf16. * config/i386/sse.md (TARGET_AVX512VL): Change HI vector to BF vector. (avx512f_cvtneps2bf16_v4sf): New define_expand. (*avx512f_cvtneps2bf16_v4sf): New define_insn. (avx512f_cvtneps2bf16_v4sf_maskz): Ditto. (avx512f_cvtneps2bf16_v4sf_mask): Ditto. (avx512f_cvtneps2bf16_v4sf_mask_1): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Add fpmath option. * gcc.target/i386/avx512bf16-vdpbf16ps-2.c: Fix scan-assembler. * gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Add x/y suffix for vcvtneps2bf16. * gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: Ditto.
-rw-r--r--gcc/config/i386/avx512bf16intrin.h12
-rw-r--r--gcc/config/i386/avx512bf16vlintrin.h29
-rw-r--r--gcc/config/i386/i386-builtin-types.def51
-rw-r--r--gcc/config/i386/i386-builtin.def54
-rw-r--r--gcc/config/i386/i386-expand.cc48
-rw-r--r--gcc/config/i386/immintrin.h2
-rw-r--r--gcc/config/i386/sse.md101
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c12
11 files changed, 189 insertions, 126 deletions
diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h
index b6e9dda..ea1d012 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -35,16 +35,16 @@
#endif /* __AVX512BF16__ */
/* Internal data types for implementing the intrinsics. */
-typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
-typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
/* Convert One BF16 Data to One Single Float Data. */
extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsbh_ss (__bfloat16 __A)
+_mm_cvtsbh_ss (__bf16 __A)
{
union{ float a; unsigned int b;} __tmp;
__tmp.b = ((unsigned int)(__A)) << 16;
@@ -57,21 +57,21 @@ extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
{
- return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
}
extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
{
- return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
}
extern __inline __m512bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
{
- return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
}
/* vcvtneps2bf16 */
diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h
index 969335f..56c28f1 100644
--- a/gcc/config/i386/avx512bf16vlintrin.h
+++ b/gcc/config/i386/avx512bf16vlintrin.h
@@ -35,57 +35,58 @@
#endif /* __AVX512BF16__ */
/* Internal data types for implementing the intrinsics. */
-typedef short __v16bh __attribute__ ((__vector_size__ (32)));
-typedef short __v8bh __attribute__ ((__vector_size__ (16)));
+typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
-typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
-typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+
+typedef __bf16 __bfloat16;
-typedef unsigned short __bfloat16;
/* vcvtne2ps2bf16 */
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
{
- return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
}
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
{
- return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
}
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
{
- return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
{
- return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
{
- return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
}
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
{
- return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
}
/* vcvtneps2bf16 */
@@ -176,13 +177,13 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
}
-extern __inline __bfloat16
+extern __inline __bf16
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtness_sbh (float __A)
{
__v4sf __V = {__A, 0, 0, 0};
- __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
- (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
+ __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
+ (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
return __R[0];
}
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 63a360b..aedae2d 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -87,6 +87,7 @@ DEF_VECTOR_TYPE (V8QI, QI)
DEF_VECTOR_TYPE (V2DF, DOUBLE)
DEF_VECTOR_TYPE (V4SF, FLOAT)
DEF_VECTOR_TYPE (V8HF, FLOAT16)
+DEF_VECTOR_TYPE (V8BF, BFLOAT16)
DEF_VECTOR_TYPE (V2DI, DI)
DEF_VECTOR_TYPE (V4SI, SI)
DEF_VECTOR_TYPE (V8HI, HI)
@@ -100,6 +101,7 @@ DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
DEF_VECTOR_TYPE (V4DF, DOUBLE)
DEF_VECTOR_TYPE (V8SF, FLOAT)
DEF_VECTOR_TYPE (V16HF, FLOAT16)
+DEF_VECTOR_TYPE (V16BF, BFLOAT16)
DEF_VECTOR_TYPE (V4DI, DI)
DEF_VECTOR_TYPE (V8SI, SI)
DEF_VECTOR_TYPE (V16HI, HI)
@@ -111,6 +113,7 @@ DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
# AVX512F vectors
DEF_VECTOR_TYPE (V32SF, FLOAT)
DEF_VECTOR_TYPE (V32HF, FLOAT16)
+DEF_VECTOR_TYPE (V32BF, BFLOAT16)
DEF_VECTOR_TYPE (V16SF, FLOAT)
DEF_VECTOR_TYPE (V8DF, DOUBLE)
DEF_VECTOR_TYPE (V8DI, DI)
@@ -1273,30 +1276,30 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
# BF16 builtins
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF)
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI)
-DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI)
-DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI)
-DEF_FUNCTION_TYPE (V16HI, V16SF)
-DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI)
-DEF_FUNCTION_TYPE (V16HI, V16SF, UHI)
-DEF_FUNCTION_TYPE (V8HI, V8SF)
-DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V8SF, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF)
-DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI)
-DEF_FUNCTION_TYPE (V8HI, V4SF, UQI)
-DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI)
-DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI)
-DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI)
-DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI)
-DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI)
-DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
+DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, V16BF, UHI)
+DEF_FUNCTION_TYPE (V16BF, V8SF, V8SF, UHI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16BF, V16SF)
+DEF_FUNCTION_TYPE (V16BF, V16SF, V16BF, UHI)
+DEF_FUNCTION_TYPE (V16BF, V16SF, UHI)
+DEF_FUNCTION_TYPE (V8BF, V8SF)
+DEF_FUNCTION_TYPE (V8BF, V8SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V8SF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF)
+DEF_FUNCTION_TYPE (V8BF, V4SF, V8BF, UQI)
+DEF_FUNCTION_TYPE (V8BF, V4SF, UQI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V32BF, V32BF, UHI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V16BF, V16BF, UQI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V8BF, V8BF, UQI)
# KEYLOCKER builtins
DEF_FUNCTION_TYPE (UINT, UINT, V2DI, V2DI, PVOID)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index e35306e..5802e20 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2779,33 +2779,33 @@ BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vae
BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
/* BF16 */
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
-BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, "__builtin_ia32_cvtne2ps2bf16_v32bf", IX86_BUILTIN_CVTNE2PS2BF16_V32BF, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_mask, "__builtin_ia32_cvtne2ps2bf16_v32bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASK, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_V32BF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v32bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V32BF_MASKZ, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf, "__builtin_ia32_cvtne2ps2bf16_v16bf", IX86_BUILTIN_CVTNE2PS2BF16_V16BF, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_mask, "__builtin_ia32_cvtne2ps2bf16_v16bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASK, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_V16BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v16bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16BF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V8SF_V8SF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf, "__builtin_ia32_cvtne2ps2bf16_v8bf", IX86_BUILTIN_CVTNE2PS2BF16_V8BF, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_mask, "__builtin_ia32_cvtne2ps2bf16_v8bf_mask", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8bf_maskz, "__builtin_ia32_cvtne2ps2bf16_v8bf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8BF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2BF16_V16SF, UNKNOWN, (int) V16BF_FTYPE_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V16SF_MASK, UNKNOWN, (int) V16BF_FTYPE_V16SF_V16BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V16SF_MASKZ, UNKNOWN, (int) V16BF_FTYPE_V16SF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2BF16_V8SF, UNKNOWN, (int) V8BF_FTYPE_V8SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V8SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V8SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V8SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V8SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2BF16_V4SF, UNKNOWN, (int) V8BF_FTYPE_V4SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2BF16_V4SF_MASK, UNKNOWN, (int) V8BF_FTYPE_V4SF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2BF16_V4SF_MASKZ, UNKNOWN, (int) V8BF_FTYPE_V4SF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPBF16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPBF16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPBF16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32BF_V32BF_UHI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPBF16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPBF16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPBF16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16BF_V16BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
/* AVX512FP16. */
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5d9e5a1..8e1ef0b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10462,9 +10462,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V8DF_FTYPE_V2DF:
case V8DF_FTYPE_V8DF:
case V4DI_FTYPE_V4DI:
- case V16HI_FTYPE_V16SF:
- case V8HI_FTYPE_V8SF:
- case V8HI_FTYPE_V4SF:
+ case V16BF_FTYPE_V16SF:
+ case V8BF_FTYPE_V8SF:
+ case V8BF_FTYPE_V4SF:
nargs = 1;
break;
case V4SF_FTYPE_V4SF_VEC_MERGE:
@@ -10592,12 +10592,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case USI_FTYPE_USI_USI:
case UDI_FTYPE_UDI_UDI:
case V16SI_FTYPE_V8DF_V8DF:
- case V32HI_FTYPE_V16SF_V16SF:
- case V16HI_FTYPE_V8SF_V8SF:
- case V8HI_FTYPE_V4SF_V4SF:
- case V16HI_FTYPE_V16SF_UHI:
- case V8HI_FTYPE_V8SF_UQI:
- case V8HI_FTYPE_V4SF_UQI:
+ case V32BF_FTYPE_V16SF_V16SF:
+ case V16BF_FTYPE_V8SF_V8SF:
+ case V8BF_FTYPE_V4SF_V4SF:
+ case V16BF_FTYPE_V16SF_UHI:
+ case V8BF_FTYPE_V8SF_UQI:
+ case V8BF_FTYPE_V4SF_UQI:
nargs = 2;
break;
case V2DI_FTYPE_V2DI_INT_CONVERT:
@@ -10803,15 +10803,15 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16HI_FTYPE_V16HI_V16HI_V16HI:
case V8SI_FTYPE_V8SI_V8SI_V8SI:
case V8HI_FTYPE_V8HI_V8HI_V8HI:
- case V32HI_FTYPE_V16SF_V16SF_USI:
- case V16HI_FTYPE_V8SF_V8SF_UHI:
- case V8HI_FTYPE_V4SF_V4SF_UQI:
- case V16HI_FTYPE_V16SF_V16HI_UHI:
- case V8HI_FTYPE_V8SF_V8HI_UQI:
- case V8HI_FTYPE_V4SF_V8HI_UQI:
- case V16SF_FTYPE_V16SF_V32HI_V32HI:
- case V8SF_FTYPE_V8SF_V16HI_V16HI:
- case V4SF_FTYPE_V4SF_V8HI_V8HI:
+ case V32BF_FTYPE_V16SF_V16SF_USI:
+ case V16BF_FTYPE_V8SF_V8SF_UHI:
+ case V8BF_FTYPE_V4SF_V4SF_UQI:
+ case V16BF_FTYPE_V16SF_V16BF_UHI:
+ case V8BF_FTYPE_V8SF_V8BF_UQI:
+ case V8BF_FTYPE_V4SF_V8BF_UQI:
+ case V16SF_FTYPE_V16SF_V32BF_V32BF:
+ case V8SF_FTYPE_V8SF_V16BF_V16BF:
+ case V4SF_FTYPE_V4SF_V8BF_V8BF:
nargs = 3;
break;
case V32QI_FTYPE_V32QI_V32QI_INT:
@@ -10958,9 +10958,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
- case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
- case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
- case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
+ case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
+ case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
+ case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
nargs = 4;
break;
case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
@@ -10998,9 +10998,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
break;
case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
- case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
- case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
- case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
+ case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
+ case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
+ case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
nargs = 4;
break;
case UQI_FTYPE_V8DI_V8DI_INT_UQI:
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index ddea249..c62d50f 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -118,9 +118,11 @@
#include <vpclmulqdqintrin.h>
+#ifdef __SSE2__
#include <avx512bf16vlintrin.h>
#include <avx512bf16intrin.h>
+#endif
#include <amxtileintrin.h>
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f4b5506..fba81a9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -187,8 +187,6 @@
UNSPEC_VP2INTERSECT
;; For AVX512BF16 support
- UNSPEC_VCVTNE2PS2BF16
- UNSPEC_VCVTNEPS2BF16
UNSPEC_VDPBF16PS
;; For AVX512FP16 suppport
@@ -28918,41 +28916,101 @@
"vp2intersectd\t{%2, %1, %0|%0, %1, %2}"
[(set_attr ("prefix") ("evex"))])
-(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+(define_mode_iterator VF_AVX512BF16VL
+ [V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
;; Converting from BF to SF
(define_mode_attr bf16_cvt_2sf
- [(V32HI "V16SF") (V16HI "V8SF") (V8HI "V4SF")])
+ [(V32BF "V16SF") (V16BF "V8SF") (V8BF "V4SF")])
;; Converting from SF to BF
(define_mode_attr sf_cvt_bf16
- [(V4SF "V8HI") (V8SF "V8HI") (V16SF "V16HI")])
+ [(V8SF "V8BF") (V16SF "V16BF")])
;; Mapping from BF to SF
(define_mode_attr sf_bf16
- [(V4SF "V8HI") (V8SF "V16HI") (V16SF "V32HI")])
+ [(V4SF "V8BF") (V8SF "V16BF") (V16SF "V32BF")])
(define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz"
- [(match_operand:BF16 0 "register_operand")
+ [(match_operand:VF_AVX512BF16VL 0 "register_operand")
(match_operand:<bf16_cvt_2sf> 1 "register_operand")
- (match_operand:<bf16_cvt_2sf> 2 "register_operand")
+ (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand")
(match_operand:<avx512fmaskmode> 3 "register_operand")]
"TARGET_AVX512BF16"
{
- emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1],
- operands[2], CONST0_RTX(<MODE>mode), operands[3]));
+ emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[2],
+ operands[1], CONST0_RTX(<MODE>mode), operands[3]));
DONE;
})
(define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
- [(set (match_operand:BF16 0 "register_operand" "=v")
- (unspec:BF16
- [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v")
- (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")]
- UNSPEC_VCVTNE2PS2BF16))]
+ [(set (match_operand:VF_AVX512BF16VL 0 "register_operand" "=v")
+ (vec_concat:VF_AVX512BF16VL
+ (float_truncate:<ssehalfvecmode>
+ (match_operand:<bf16_cvt_2sf> 2 "nonimmediate_operand" "vm"))
+ (float_truncate:<ssehalfvecmode>
+ (match_operand:<bf16_cvt_2sf> 1 "register_operand" "v"))))]
"TARGET_AVX512BF16"
"vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
+(define_expand "avx512f_cvtneps2bf16_v4sf"
+ [(set (match_operand:V8BF 0 "register_operand")
+ (vec_concat:V8BF
+ (float_truncate:V4BF
+ (match_operand:V4SF 1 "nonimmediate_operand"))
+ (match_dup 2)))]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+ "operands[2] = CONST0_RTX (V4BFmode);")
+
+(define_insn "*avx512f_cvtneps2bf16_v4sf"
+ [(set (match_operand:V8BF 0 "register_operand" "=v")
+ (vec_concat:V8BF
+ (float_truncate:V4BF
+ (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
+ (match_operand:V4BF 2 "const0_operand")))]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+ "vcvtneps2bf16{x}\t{%1, %0|%0, %1}")
+
+(define_expand "avx512f_cvtneps2bf16_v4sf_maskz"
+ [(match_operand:V8BF 0 "register_operand")
+ (match_operand:V4SF 1 "nonimmediate_operand")
+ (match_operand:QI 2 "register_operand")]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+{
+ emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
+ CONST0_RTX(V8BFmode), operands[2], CONST0_RTX(V4BFmode)));
+ DONE;
+})
+
+(define_expand "avx512f_cvtneps2bf16_v4sf_mask"
+ [(match_operand:V8BF 0 "register_operand")
+ (match_operand:V4SF 1 "nonimmediate_operand")
+ (match_operand:V8BF 2 "nonimm_or_0_operand")
+ (match_operand:QI 3 "register_operand")]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+{
+ emit_insn (gen_avx512f_cvtneps2bf16_v4sf_mask_1(operands[0], operands[1],
+ operands[2], operands[3], CONST0_RTX(V4BFmode)));
+ DONE;
+})
+
+(define_insn "avx512f_cvtneps2bf16_v4sf_mask_1"
+ [(set (match_operand:V8BF 0 "register_operand" "=v")
+ (vec_concat:V8BF
+ (vec_merge:V4BF
+ (float_truncate:V4BF
+ (match_operand:V4SF 1 "nonimmediate_operand" "vm"))
+ (vec_select:V4BF
+ (match_operand:V8BF 2 "nonimm_or_0_operand" "0C")
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)]))
+ (match_operand:QI 3 "register_operand" "Yk"))
+ (match_operand:V4BF 4 "const0_operand")))]
+ "TARGET_AVX512BF16 && TARGET_AVX512VL"
+ "vcvtneps2bf16{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}")
+
+(define_mode_iterator VF1_AVX512_256 [V16SF (V8SF "TARGET_AVX512VL")])
+
(define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
[(match_operand:<sf_cvt_bf16> 0 "register_operand")
- (match_operand:VF1_AVX512VL 1 "register_operand")
+ (match_operand:VF1_AVX512_256 1 "nonimmediate_operand")
(match_operand:<avx512fmaskmode> 2 "register_operand")]
"TARGET_AVX512BF16"
{
@@ -28963,11 +29021,10 @@
(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
[(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
- (unspec:<sf_cvt_bf16>
- [(match_operand:VF1_AVX512VL 1 "register_operand" "v")]
- UNSPEC_VCVTNEPS2BF16))]
+ (float_truncate:<sf_cvt_bf16>
+ (match_operand:VF1_AVX512_256 1 "nonimmediate_operand" "vm")))]
"TARGET_AVX512BF16"
- "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
+ "vcvtneps2bf16<qq2phsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}")
(define_expand "avx512f_dpbf16ps_<mode>_maskz"
[(match_operand:VF1_AVX512VL 0 "register_operand")
@@ -28987,7 +29044,7 @@
(unspec:VF1_AVX512VL
[(match_operand:VF1_AVX512VL 1 "register_operand" "0")
(match_operand:<sf_bf16> 2 "register_operand" "v")
- (match_operand:<sf_bf16> 3 "register_operand" "v")]
+ (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
UNSPEC_VDPBF16PS))]
"TARGET_AVX512BF16"
"vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}")
@@ -28998,7 +29055,7 @@
(unspec:VF1_AVX512VL
[(match_operand:VF1_AVX512VL 1 "register_operand" "0")
(match_operand:<sf_bf16> 2 "register_operand" "v")
- (match_operand:<sf_bf16> 3 "register_operand" "v")]
+ (match_operand:<sf_bf16> 3 "nonimmediate_operand" "vm")]
UNSPEC_VDPBF16PS)
(match_dup 1)
(match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))]
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
index 831abd3..8e929e6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -O2" } */
-/* { dg-additional-options "-fno-PIE" { target ia32 } } */
+/* { dg-additional-options "-fno-PIE -mfpmath=sse" { target ia32 } } */
/* { dg-final { scan-assembler-times "sall\[ \\t\]+\[^\{\n\]*16" 1 } } */
/* { dg-final { scan-assembler-times "movl" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
index b64ad7b..02ebdd8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -O2" } */
-/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
index 8f21b1b..b71addd 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
#include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
index 0969ae1..d3a9bdf 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c
@@ -1,11 +1,11 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16y\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16x\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
#include <immintrin.h>