diff options
Diffstat (limited to 'gcc')
35 files changed, 973 insertions, 15 deletions
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 5a6aed0..b128c31 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -1007,6 +1007,8 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_AMX_TF32); if (eax & bit_AMX_TRANSPOSE) set_feature (FEATURE_AMX_TRANSPOSE); + if (eax & bit_AMX_FP8) + set_feature (FEATURE_AMX_FP8); } } diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index a85b380..771bde2 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -134,6 +134,8 @@ along with GCC; see the file COPYING3. If not see (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_TF32) #define OPTION_MASK_ISA2_AMX_TRANSPOSE_SET \ (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_TRANSPOSE) +#define OPTION_MASK_ISA2_AMX_FP8_SET \ + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_FP8) /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same as -msse4.2. */ @@ -297,7 +299,8 @@ along with GCC; see the file COPYING3. If not see (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_INT8_UNSET \ | OPTION_MASK_ISA2_AMX_BF16_UNSET | OPTION_MASK_ISA2_AMX_FP16_UNSET \ | OPTION_MASK_ISA2_AMX_COMPLEX_UNSET | OPTION_MASK_ISA2_AMX_AVX512_UNSET \ - | OPTION_MASK_ISA2_AMX_TF32_UNSET | OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET) + | OPTION_MASK_ISA2_AMX_TF32_UNSET | OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET \ + | OPTION_MASK_ISA2_AMX_FP8_UNSET) #define OPTION_MASK_ISA2_AMX_INT8_UNSET OPTION_MASK_ISA2_AMX_INT8 #define OPTION_MASK_ISA2_AMX_BF16_UNSET OPTION_MASK_ISA2_AMX_BF16 #define OPTION_MASK_ISA2_UINTR_UNSET OPTION_MASK_ISA2_UINTR @@ -330,6 +333,7 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA2_AMX_AVX512_UNSET OPTION_MASK_ISA2_AMX_AVX512 #define OPTION_MASK_ISA2_AMX_TF32_UNSET OPTION_MASK_ISA2_AMX_TF32 #define OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET OPTION_MASK_ISA2_AMX_TRANSPOSE +#define OPTION_MASK_ISA2_AMX_FP8_UNSET OPTION_MASK_ISA2_AMX_FP8 /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same as -mno-sse4.1. */ @@ -1463,6 +1467,19 @@ ix86_handle_option (struct gcc_options *opts, } return true; + case OPT_mamx_fp8: + if (value) + { + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_FP8_SET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP8_SET; + } + else + { + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_FP8_UNSET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP8_UNSET; + } + return true; + case OPT_mfma: if (value) { diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h index f5f8ba2..68d6325 100644 --- a/gcc/common/config/i386/i386-cpuinfo.h +++ b/gcc/common/config/i386/i386-cpuinfo.h @@ -272,6 +272,7 @@ enum processor_features FEATURE_AMX_AVX512, FEATURE_AMX_TF32, FEATURE_AMX_TRANSPOSE, + FEATURE_AMX_FP8, CPU_FEATURE_MAX }; diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h index 28c8d07..ccd0d19 100644 --- a/gcc/common/config/i386/i386-isas.h +++ b/gcc/common/config/i386/i386-isas.h @@ -194,4 +194,5 @@ ISA_NAMES_TABLE_START ISA_NAMES_TABLE_ENTRY("amx-tf32", FEATURE_AMX_TF32, P_NONE, "-mamx-tf32") ISA_NAMES_TABLE_ENTRY("amx-transpose", FEATURE_AMX_TRANSPOSE, P_NONE, "-mamx-transpose") + ISA_NAMES_TABLE_ENTRY("amx-fp8", FEATURE_AMX_FP8, P_NONE, "-mamx-fp8") ISA_NAMES_TABLE_END diff --git a/gcc/config.gcc b/gcc/config.gcc index 5959714..6538c84 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -459,7 +459,7 @@ i[34567]86-*-* | x86_64-*-*) avx10_2satcvtintrin.h avx10_2-512satcvtintrin.h avx10_2minmaxintrin.h avx10_2-512minmaxintrin.h avx10_2copyintrin.h amxavx512intrin.h amxtf32intrin.h - amxtransposeintrin.h" + amxtransposeintrin.h amxfp8intrin.h" ;; ia64-*-*) extra_headers=ia64intrin.h diff --git a/gcc/config/i386/amxfp8intrin.h b/gcc/config/i386/amxfp8intrin.h new file mode 100644 index 0000000..7e6fca4 --- /dev/null +++ b/gcc/config/i386/amxfp8intrin.h @@ -0,0 +1,67 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AMXFP8INTRIN_H_INCLUDED +#define _AMXFP8INTRIN_H_INCLUDED + +#if defined(__x86_64__) +#define _tile_dpbf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdpbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbhf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdpbhf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbhf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dphbf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdphbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dphf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdphf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbf8ps(dst,src1,src2) \ + _tile_dpbf8ps_internal (dst,src1,src2) + +#define _tile_dpbhf8ps(dst,src1,src2) \ + _tile_dpbhf8ps_internal (dst,src1,src2) + +#define _tile_dphbf8ps(dst,src1,src2) \ + _tile_dphbf8ps_internal (dst,src1,src2) + +#define _tile_dphf8ps(dst,src1,src2) \ + _tile_dphf8ps_internal (dst,src1,src2) + +#endif + +#ifdef __DISABLE_AMX_FP8__ +#undef __DISABLE_AMX_FP8__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_FP8__ */ + +#endif /* _AMXFP8INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index d609be9..e3dda94 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -164,6 +164,7 @@ /* AMX sub leaf (%eax == 0x1e, %ecx == 1) */ /* %eax */ +#define bit_AMX_FP8 (1 << 4) #define bit_AMX_TRANSPOSE (1 << 5) #define bit_AMX_TF32 (1 << 6) #define bit_AMX_AVX512 (1 << 7) diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index ee7497d..f18aa15 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -747,6 +747,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__AMX_TF32__"); if (isa_flag2 & OPTION_MASK_ISA2_AMX_TRANSPOSE) def_or_undef (parse_in, "__AMX_TRANSPOSE__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_FP8) + def_or_undef (parse_in, "__AMX_FP8__"); if (TARGET_IAMCU) { def_or_undef (parse_in, "__iamcu"); diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def index ab8c2b8..1d1ef15 100644 --- a/gcc/config/i386/i386-isa.def +++ b/gcc/config/i386/i386-isa.def @@ -126,3 +126,4 @@ DEF_PTA(AVX10_2_512) DEF_PTA(AMX_AVX512) DEF_PTA(AMX_TF32) DEF_PTA(AMX_TRANSPOSE) +DEF_PTA(AMX_FP8) diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 03808c4..d3b26e2 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -266,7 +266,8 @@ static struct ix86_target_opts isa2_opts[] = { "-mavx10.2-512", OPTION_MASK_ISA2_AVX10_2_512 }, { "-mamx-avx512", OPTION_MASK_ISA2_AMX_AVX512 }, { "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 }, - { "-mamx-transpose", OPTION_MASK_ISA2_AMX_TRANSPOSE } + { "-mamx-transpose", OPTION_MASK_ISA2_AMX_TRANSPOSE }, + { "-mamx-fp8", OPTION_MASK_ISA2_AMX_FP8 } }; static struct ix86_target_opts isa_opts[] = { @@ -1138,6 +1139,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("amx-avx512", OPT_mamx_avx512), IX86_ATTR_ISA ("amx-tf32", OPT_mamx_tf32), IX86_ATTR_ISA ("amx-transpose", OPT_mamx_transpose), + IX86_ATTR_ISA ("amx-fp8", OPT_mamx_fp8), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 4922cad..a833f47 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1402,3 +1402,7 @@ Support AMX-TF32 built-in functions and code generation. mamx-transpose Target Mask(ISA2_AMX_TRANSPOSE) Var(ix86_isa_flags2) Save Support AMX-TRANSPOSE built-in functions and code generation. + +mamx-fp8 +Target Mask(ISA2_AMX_FP8) Var(ix86_isa_flags2) Save +Support AMX-FP8 built-in functions and code generation. diff --git a/gcc/config/i386/i386.opt.urls b/gcc/config/i386/i386.opt.urls index cda5484..425ee27 100644 --- a/gcc/config/i386/i386.opt.urls +++ b/gcc/config/i386/i386.opt.urls @@ -622,3 +622,6 @@ UrlSuffix(gcc/x86-Options.html#index-mamx-tf32) mamx-transpose UrlSuffix(gcc/x86-Options.html#index-mamx-transpose) +mamx-fp8 +UrlSuffix(gcc/x86-Options.html#index-mamx-fp8) + diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index a870cc6..7e0f137 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -138,6 +138,8 @@ #include <amxtransposeintrin.h> +#include <amxfp8intrin.h> + #include <prfchwintrin.h> #include <keylockerintrin.h> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 73fc0c9..0d5b6d1 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -7559,6 +7559,11 @@ Enable/disable the generation of the AMX-TF32 instructions. @itemx no-amx-transpose Enable/disable the generation of the AMX-TRANSPOSE instructions. +@cindex @code{target("amx-fp8")} function attribute, x86 +@item amx-fp8 +@itemx no-amx-fp8 +Enable/disable the generation of the AMX-FP8 instructions. + @cindex @code{target("cld")} function attribute, x86 @item cld @itemx no-cld diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index b2bb9e2..87d3744 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -1482,7 +1482,7 @@ See RS/6000 and PowerPC Options. -mvpclmulqdq -mavx512bitalg -mmovdiri -mmovdir64b -mavx512vpopcntdq -mavx512vnni -mprfchw -mrdpid -mrdseed -msgx -mavx512vp2intersect -mserialize -mtsxldtrk --mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni +-mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni -mamx-fp8 -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mapxf -musermsr -mavx10.1 -mavx10.1-256 -mavx10.1-512 -mevex512 -mavx10.2 -mavx10.2-256 @@ -35683,6 +35683,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @need 200 @opindex mamx-transpose @itemx -mamx-transpose +@need 200 +@itemx -mamx-fp8 +@opindex mamx-fp8 These switches enable the use of instructions in the MMX, SSE, AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA, AES, PCLMUL, CLFLUSHOPT, CLWB, FSGSBASE, PTWRITE, RDRND, F16C, FMA, PCONFIG, @@ -35693,9 +35696,9 @@ WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16, ENQCMD, AVX512VPOPCNTDQ, AVX512VNNI, SERIALIZE, UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16, AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, AMX-FP16, PREFETCHI, RAOINT, AMX-COMPLEX, AVXVNNIINT16, SM3, SHA512, -SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32, AMX-TRANSPOSE or -CLDEMOTE extended instruction sets. Each has a corresponding @option{-mno-} -option to disable use of these instructions. +SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32, AMX-TRANSPOSE, +AMX-FP8 or CLDEMOTE extended instruction sets. Each has a corresponding +@option{-mno-} option to disable use of these instructions. These extensions are also available as built-in functions: see @ref{x86 Built-in Functions}, for details of the functions enabled and diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index 09831b6..b627c3c 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -2659,6 +2659,9 @@ Target supports the execution of @code{amx-tf32} instructions. @item amx_transpose Target supports the execution of @code{amx-transpose} instructions. +@item amx_fp8 +Target supports the execution of @code{amx-fp8} instructions. + @item cell_hw Test system can execute AltiVec and Cell PPU instructions. diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C index 05461ed..8d348d9 100644 --- a/gcc/testsuite/g++.dg/other/i386-2.C +++ b/gcc/testsuite/g++.dg/other/i386-2.C @@ -1,5 +1,5 @@ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ +/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */ /* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */ /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h, diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C index 0e5df7a..accb881 100644 --- a/gcc/testsuite/g++.dg/other/i386-3.C +++ b/gcc/testsuite/g++.dg/other/i386-3.C @@ -1,5 +1,5 @@ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ +/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */ /* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */ /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h, diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h index 28a71cb..b765f30 100644 --- a/gcc/testsuite/gcc.target/i386/amx-check.h +++ b/gcc/testsuite/gcc.target/i386/amx-check.h @@ -263,6 +263,9 @@ main () #ifdef AMX_TRANSPOSE && __builtin_cpu_supports ("amx-transpose") #endif +#ifdef AMX_FP8 + && __builtin_cpu_supports ("amx-fp8") +#endif #ifdef __linux__ && request_perm_xtile_data () #endif diff --git a/gcc/testsuite/gcc.target/i386/amx-helper.h b/gcc/testsuite/gcc.target/i386/amx-helper.h index b505581..b29c7ce 100644 --- a/gcc/testsuite/gcc.target/i386/amx-helper.h +++ b/gcc/testsuite/gcc.target/i386/amx-helper.h @@ -116,6 +116,17 @@ void init_fp32_max_tile_buffer (uint8_t *buf) ptr[i * 16 + j] = 2.5f * i + 1.25f * j; } +/* Init tile fp32 buffer with zero */ +void init_fp32_max_tile_zero_buffer (uint8_t *buf) +{ + int i, j; + float* ptr = (float *) buf; + + for (i = 0; i < 16; i++) + for (j = 0; j < 16; j++) + ptr[i * 16 + j] = 0.0f; +} + /* Init tile buffer with int32 */ void init_int32_max_tile_buffer (uint8_t *buf) { @@ -127,6 +138,53 @@ void init_int32_max_tile_buffer (uint8_t *buf) ptr[i * 16 + j] = (uint32_t) (3 * j - 16 * i); } +void +init_fp8_max_tile_buffer (uint8_t *buf) +{ + int i, j; + + for (i = 0; i < 16; i++) + for (j = 0; j < 64; j++) + { + int idx = i * 64 + j; + + /* Positive Infinity (S11111.00) */ + if (idx % 128 == 0) + buf[idx] = 0x7C; + + /* Negative Infinity (S11111.00 with sign bit set) */ + else if (idx % 128 == 1) + buf[idx] = 0xFC; + + /* Positive NaN (S11111.01) */ + else if (idx % 128 == 2) + buf[idx] = 0x7D; + + /* Negative NaN (S11111.01 with sign bit set) */ + else if (idx % 128 == 3) + buf[idx] = 0xFD; + + /* insert Positive NaN (S11111.10) */ + else if (idx % 128 == 4) + buf[idx] = 0x7E; + + /* Negative NaN (S11111.10 with sign bit set) */ + else if (idx % 128 == 5) + buf[idx] = 0xFE; + + /* Positive NaN (S11111.11) */ + else if (idx % 128 == 6) + buf[idx] = 0x7F; + + /* Negative NaN (S11111.11 with sign bit set) */ + else if (idx % 128 == 7) + buf[idx] = 0xFF; + + else + buf[idx] = (uint8_t) ((idx * 251) & 0xFF); + } +} + #define COMPARE_ZMM(A, B) \ for (int j = 0; j < 16; j++) \ { \ diff --git a/gcc/testsuite/gcc.target/i386/amxfp8-asmatt-1.c b/gcc/testsuite/gcc.target/i386/amxfp8-asmatt-1.c new file mode 100644 index 0000000..8bc261c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxfp8-asmatt-1.c @@ -0,0 +1,20 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mamx-fp8" } */ +/* { dg-final { scan-assembler "tdpbf8ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "tdpbhf8ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "tdphbf8ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "tdphf8ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ + +#include <immintrin.h> + +#define TMM1 1 +#define TMM2 2 +#define TMM3 3 + +void TEST () +{ + _tile_dpbf8ps (TMM1, TMM2, TMM3); + _tile_dpbhf8ps (TMM1, TMM2, TMM3); + _tile_dphbf8ps (TMM1, TMM2, TMM3); + _tile_dphf8ps (TMM1, TMM2, TMM3); +} diff --git a/gcc/testsuite/gcc.target/i386/amxfp8-asmintel-1.c b/gcc/testsuite/gcc.target/i386/amxfp8-asmintel-1.c new file mode 100644 index 0000000..f83b7a9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxfp8-asmintel-1.c @@ -0,0 +1,21 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-require-effective-target masm_intel } */ +/* { dg-options "-O2 -mamx-fp16 -masm=intel" } */ +/* { dg-final { scan-assembler "tdpbf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +/* { dg-final { scan-assembler "tdpbhf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +/* { dg-final { scan-assembler "tdphbf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +/* { dg-final { scan-assembler "tdphf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ + +#include <immintrin.h> + +#define TMM1 1 +#define TMM2 2 +#define TMM3 3 + +void TEST () +{ + _tile_dpbf8ps (TMM1, TMM2, TMM3); + _tile_dpbhf8ps (TMM1, TMM2, TMM3); + _tile_dphbf8ps (TMM1, TMM2, TMM3); + _tile_dphf8ps (TMM1, TMM2, TMM3); +} diff --git a/gcc/testsuite/gcc.target/i386/amxfp8-dpbf8ps-2.c b/gcc/testsuite/gcc.target/i386/amxfp8-dpbf8ps-2.c new file mode 100644 index 0000000..e167659 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxfp8-dpbf8ps-2.c @@ -0,0 +1,120 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_fp8 } */ +/* { dg-options "-O2 -mamx-fp8" } */ + +#define AMX_FP8 +#define DO_TEST test_amx_fp8_dpbf8ps +void test_amx_fp8_dpbf8ps (); + +#include "amx-helper.h" +#include "fp8-helper.h" +#include "fp-emulation.h" + +void +calc_matrix_dpbf8ps (__tile *dst, __tile *src1, __tile *src2) +{ + unsigned char *src1_buf = (unsigned char *)src1->buf; + unsigned char *src2_buf = (unsigned char *)src2->buf; + float *dst_buf = (float *)dst->buf; + + int M = src1->rows; + int N = src1->colsb / 4; + int K = src2->colsb / 4; + + int* valueState_a = (int*)malloc(sizeof(int)); + int* valueState_b = (int*)malloc(sizeof(int)); + int* valueState = (int *)malloc(M * K * sizeof(int)); + __int128_t *temp = (__int128_t *)malloc(M * K * sizeof(__int128_t)); + + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + for (int k = 0; k < K; k++) + { + int64_t s1e0 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 0], 1, valueState_a); + int64_t s2e0 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 0], 1, valueState_b); + int valueState_e0 = state_handler(*valueState_a, *valueState_b, s1e0, s2e0, '*'); + + int64_t s1e1 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 1], 1, valueState_a); + int64_t s2e1 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 1], 1, valueState_b); + int valueState_e1 = state_handler(*valueState_a, *valueState_b, s1e1, s2e1, '*'); + + int64_t s1e2 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 2], 1, valueState_a); + int64_t s2e2 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 2], 1, valueState_b); + int valueState_e2 = state_handler(*valueState_a, *valueState_b, s1e2, s2e2, '*'); + + int64_t s1e3 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 3], 1, valueState_a); + int64_t s2e3 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 3], 1, valueState_b); + int valueState_e3 = state_handler(*valueState_a, *valueState_b, s1e3, s2e3, '*'); + + valueState[i * K + k] = state_handler( + state_handler( + state_handler( + state_handler( + valueState_e0, valueState_e1, (__int128_t)s1e0 * s2e0, s1e1 * s2e1, '+'), + valueState_e2, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1, s1e2 * s2e2, '+'), + valueState_e3, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2, s1e3 * s2e3, '+'), + valueState[i * K + k], (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2 + s1e3 * s2e3, temp[i * K + k], '+'); + + temp[i * K + k] += (__int128_t)s1e0 * (__int128_t)s2e0 + + (__int128_t)s1e1 * (__int128_t)s2e1 + + (__int128_t)s1e2 * (__int128_t)s2e2 + + (__int128_t)s1e3 * (__int128_t)s2e3; + } + } + + for (int i = 0; i < M; i++) + for (int k = 0; k < K; k++) + { + float tmp_float = shift_int128_to_fp32(temp[i * K + k], 1, 1); + if (isnan(dst_buf[i * K + k]) || isnan(tmp_float)) + dst_buf[i * K + k] = -nanf(""); + else switch(valueState[i * K + k]) + { + case -3: + dst_buf[i * K + k] = -INFINITY; + break; + case 3: + dst_buf[i * K + k] = 1.0 / 0.0; + break; + case 2: + case -2: + dst_buf[i * K + k] = -nanf(""); + break; + case -1: + case 0: + case 1: + default: + dst_buf[i * K + k] += tmp_float; + break; + } + } + + free(valueState_a); + free(valueState_b); + free(valueState); + free(temp); +} + +void test_amx_fp8_dpbf8ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_zero_buf[1024], tmp_src_buf[1024]; + + init_fp32_max_tile_zero_buffer (tmp_dst_zero_buf); + init_fp8_max_tile_buffer(tmp_src_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_src_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_src_buf); + + calc_matrix_dpbf8ps (&dst, &src1, &src2); + + _tile_dpbf8ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_float_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxfp8-dpbhf8ps-2.c b/gcc/testsuite/gcc.target/i386/amxfp8-dpbhf8ps-2.c new file mode 100644 index 0000000..ce950c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxfp8-dpbhf8ps-2.c @@ -0,0 +1,120 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_fp8 } */ +/* { dg-options "-O2 -mamx-fp8" } */ + +#define AMX_FP8 +#define DO_TEST test_amx_fp8_dpbhf8ps +void test_amx_fp8_dpbhf8ps (); + +#include "amx-helper.h" +#include "fp8-helper.h" +#include "fp-emulation.h" + +void +calc_matrix_dpbhf8ps (__tile *dst, __tile *src1, __tile *src2) +{ + unsigned char *src1_buf = (unsigned char *)src1->buf; + unsigned char *src2_buf = (unsigned char *)src2->buf; + float *dst_buf = (float *)dst->buf; + + int M = src1->rows; + int N = src1->colsb / 4; + int K = src2->colsb / 4; + + int* valueState_a = (int*)malloc(sizeof(int)); + int* valueState_b = (int*)malloc(sizeof(int)); + int* valueState = (int *)malloc(M * K * sizeof(int)); + __int128_t *temp = (__int128_t *)malloc(M * K * sizeof(__int128_t)); + + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + for (int k = 0; k < K; k++) + { + int64_t s1e0 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 0], 1, valueState_a); + int64_t s2e0 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 0], 0, valueState_b); + int valueState_e0 = state_handler(*valueState_a, *valueState_b, s1e0, s2e0, '*'); + + int64_t s1e1 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 1], 1, valueState_a); + int64_t s2e1 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 1], 0, valueState_b); + int valueState_e1 = state_handler(*valueState_a, *valueState_b, s1e1, s2e1, '*'); + + int64_t s1e2 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 2], 1, valueState_a); + int64_t s2e2 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 2], 0, valueState_b); + int valueState_e2 = state_handler(*valueState_a, *valueState_b, s1e2, s2e2, '*'); + + int64_t s1e3 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 3], 1, valueState_a); + int64_t s2e3 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 3], 0, valueState_b); + int valueState_e3 = state_handler(*valueState_a, *valueState_b, s1e3, s2e3, '*'); + + valueState[i * K + k] = state_handler( + state_handler( + state_handler( + state_handler( + valueState_e0, valueState_e1, (__int128_t)s1e0 * s2e0, s1e1 * s2e1, '+'), + valueState_e2, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1, s1e2 * s2e2, '+'), + valueState_e3, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2, s1e3 * s2e3, '+'), + valueState[i * K + k], (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2 + s1e3 * s2e3, temp[i * K + k], '+'); + + temp[i * K + k] += (__int128_t)s1e0 * (__int128_t)s2e0 + + (__int128_t)s1e1 * (__int128_t)s2e1 + + (__int128_t)s1e2 * (__int128_t)s2e2 + + (__int128_t)s1e3 * (__int128_t)s2e3; + } + } + + for (int i = 0; i < M; i++) + for (int k = 0; k < K; k++) + { + float tmp_float = shift_int128_to_fp32(temp[i * K + k], 1, 0); + if (isnan(dst_buf[i * K + k]) || isnan(tmp_float)) + dst_buf[i * K + k] = -nanf(""); + else switch(valueState[i * K + k]) + { + case -3: + dst_buf[i * K + k] = -INFINITY; + break; + case 3: + dst_buf[i * K + k] = 1.0 / 0.0; + break; + case 2: + case -2: + dst_buf[i * K + k] = -nanf(""); + break; + case -1: + case 0: + case 1: + default: + dst_buf[i * K + k] += tmp_float; + break; + } + } + + free(valueState_a); + free(valueState_b); + free(valueState); + free(temp); +} + +void test_amx_fp8_dpbhf8ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_zero_buf[1024], tmp_src_buf[1024]; + + init_fp32_max_tile_zero_buffer (tmp_dst_zero_buf); + init_fp8_max_tile_buffer(tmp_src_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_src_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_src_buf); + + calc_matrix_dpbhf8ps (&dst, &src1, &src2); + + _tile_dpbhf8ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_float_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxfp8-dphbf8ps-2.c b/gcc/testsuite/gcc.target/i386/amxfp8-dphbf8ps-2.c new file mode 100644 index 0000000..7bb6c1d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxfp8-dphbf8ps-2.c @@ -0,0 +1,120 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_fp8 } */ +/* { dg-options "-O2 -mamx-fp8" } */ + +#define AMX_FP8 +#define DO_TEST test_amx_fp8_dphbf8ps +void test_amx_fp8_dphbf8ps (); + +#include "amx-helper.h" +#include "fp8-helper.h" +#include "fp-emulation.h" + +void +calc_matrix_dphbf8ps (__tile *dst, __tile *src1, __tile *src2) +{ + unsigned char *src1_buf = (unsigned char *)src1->buf; + unsigned char *src2_buf = (unsigned char *)src2->buf; + float *dst_buf = (float *)dst->buf; + + int M = src1->rows; + int N = src1->colsb / 4; + int K = src2->colsb / 4; + + int* valueState_a = (int*)malloc(sizeof(int)); + int* valueState_b = (int*)malloc(sizeof(int)); + int* valueState = (int *)malloc(M * K * sizeof(int)); + __int128_t *temp = (__int128_t *)malloc(M * K * sizeof(__int128_t)); + + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + for (int k = 0; k < K; k++) + { + int64_t s1e0 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 0], 0, valueState_a); + int64_t s2e0 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 0], 1, valueState_b); + int valueState_e0 = state_handler(*valueState_a, *valueState_b, s1e0, s2e0, '*'); + + int64_t s1e1 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 1], 0, valueState_a); + int64_t s2e1 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 1], 1, valueState_b); + int valueState_e1 = state_handler(*valueState_a, *valueState_b, s1e1, s2e1, '*'); + + int64_t s1e2 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 2], 0, valueState_a); + int64_t s2e2 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 2], 1, valueState_b); + int valueState_e2 = state_handler(*valueState_a, *valueState_b, s1e2, s2e2, '*'); + + int64_t s1e3 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 3], 0, valueState_a); + int64_t s2e3 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 3], 1, valueState_b); + int valueState_e3 = state_handler(*valueState_a, *valueState_b, s1e3, s2e3, '*'); + + valueState[i * K + k] = state_handler( + state_handler( + state_handler( + state_handler( + valueState_e0, valueState_e1, (__int128_t)s1e0 * s2e0, s1e1 * s2e1, '+'), + valueState_e2, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1, s1e2 * s2e2, '+'), + valueState_e3, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2, s1e3 * s2e3, '+'), + valueState[i * K + k], (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2 + s1e3 * s2e3, temp[i * K + k], '+'); + + temp[i * K + k] += (__int128_t)s1e0 * (__int128_t)s2e0 + + (__int128_t)s1e1 * (__int128_t)s2e1 + + (__int128_t)s1e2 * (__int128_t)s2e2 + + (__int128_t)s1e3 * (__int128_t)s2e3; + } + } + + for (int i = 0; i < M; i++) + for (int k = 0; k < K; k++) + { + float tmp_float = shift_int128_to_fp32(temp[i * K + k], 0, 1); + if (isnan(dst_buf[i * K + k]) || isnan(tmp_float)) + dst_buf[i * K + k] = -nanf(""); + else switch(valueState[i * K + k]) + { + case -3: + dst_buf[i * K + k] = -INFINITY; + break; + case 3: + dst_buf[i * K + k] = 1.0 / 0.0; + break; + case 2: + case -2: + dst_buf[i * K + k] = -nanf(""); + break; + case -1: + case 0: + case 1: + default: + dst_buf[i * K + k] += tmp_float; + break; + } + } + + free(valueState_a); + free(valueState_b); + free(valueState); + free(temp); +} + +void test_amx_fp8_dphbf8ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_zero_buf[1024], tmp_src_buf[1024]; + + init_fp32_max_tile_zero_buffer (tmp_dst_zero_buf); + init_fp8_max_tile_buffer(tmp_src_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_src_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_src_buf); + + calc_matrix_dphbf8ps (&dst, &src1, &src2); + + _tile_dphbf8ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_float_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxfp8-dphf8ps-2.c b/gcc/testsuite/gcc.target/i386/amxfp8-dphf8ps-2.c new file mode 100644 index 0000000..0421d62 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxfp8-dphf8ps-2.c @@ -0,0 +1,120 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_fp8 } */ +/* { dg-options "-O2 -mamx-fp8" } */ + +#define AMX_FP8 +#define DO_TEST test_amx_fp8_dphf8ps +void test_amx_fp8_dphf8ps (); + +#include "amx-helper.h" +#include "fp8-helper.h" +#include "fp-emulation.h" + +void +calc_matrix_dphf8ps (__tile *dst, __tile *src1, __tile *src2) +{ + unsigned char *src1_buf = (unsigned char *)src1->buf; + unsigned char *src2_buf = (unsigned char *)src2->buf; + float *dst_buf = (float *)dst->buf; + + int M = src1->rows; + int N = src1->colsb / 4; + int K = src2->colsb / 4; + + int* valueState_a = (int*)malloc(sizeof(int)); + int* valueState_b = (int*)malloc(sizeof(int)); + int* valueState = (int *)malloc(M * K * sizeof(int)); + __int128_t *temp = (__int128_t *)malloc(M * K * sizeof(__int128_t)); + + for (int i = 0; i < M; i++) + { + for (int j = 0; j < N; j++) + for (int k = 0; k < K; k++) + { + int64_t s1e0 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 0], 0, valueState_a); + int64_t s2e0 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 0], 0, valueState_b); + int valueState_e0 = state_handler(*valueState_a, *valueState_b, s1e0, s2e0, '*'); + + int64_t s1e1 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 1], 0, valueState_a); + int64_t s2e1 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 1], 0, valueState_b); + int valueState_e1 = state_handler(*valueState_a, *valueState_b, s1e1, s2e1, '*'); + + int64_t s1e2 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 2], 0, valueState_a); + int64_t s2e2 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 2], 0, valueState_b); + int valueState_e2 = state_handler(*valueState_a, *valueState_b, s1e2, s2e2, '*'); + + int64_t s1e3 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 3], 0, valueState_a); + int64_t s2e3 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 3], 0, valueState_b); + int valueState_e3 = state_handler(*valueState_a, *valueState_b, s1e3, s2e3, '*'); + + valueState[i * K + k] = state_handler( + state_handler( + state_handler( + state_handler( + valueState_e0, valueState_e1, (__int128_t)s1e0 * s2e0, s1e1 * s2e1, '+'), + valueState_e2, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1, s1e2 * s2e2, '+'), + valueState_e3, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2, s1e3 * s2e3, '+'), + valueState[i * K + k], (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2 + s1e3 * s2e3, temp[i * K + k], '+'); + + temp[i * K + k] += (__int128_t)s1e0 * (__int128_t)s2e0 + + (__int128_t)s1e1 * (__int128_t)s2e1 + + (__int128_t)s1e2 * (__int128_t)s2e2 + + (__int128_t)s1e3 * (__int128_t)s2e3; + } + } + + for (int i = 0; i < M; i++) + for (int k = 0; k < K; k++) + { + float tmp_float = shift_int128_to_fp32(temp[i * K + k], 0, 0); + if (isnan(dst_buf[i * K + k]) || isnan(tmp_float)) + dst_buf[i * K + k] = -nanf(""); + else switch(valueState[i * K + k]) + { + case -3: + dst_buf[i * K + k] = -INFINITY; + break; + case 3: + dst_buf[i * K + k] = 1.0 / 0.0; + break; + case 2: + case -2: + dst_buf[i * K + k] = -nanf(""); + break; + case -1: + case 0: + case 1: + default: + dst_buf[i * K + k] += tmp_float; + break; + } + } + + free(valueState_a); + free(valueState_b); + free(valueState); + free(temp); +} + +void test_amx_fp8_dphf8ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_zero_buf[1024], tmp_src_buf[1024]; + + init_fp32_max_tile_zero_buffer (tmp_dst_zero_buf); + init_fp8_max_tile_buffer(tmp_src_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_src_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_src_buf); + + calc_matrix_dphf8ps (&dst, &src1, &src2); + + _tile_dphf8ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_float_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/fp-emulation.h b/gcc/testsuite/gcc.target/i386/fp-emulation.h new file mode 100644 index 0000000..b0689a9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fp-emulation.h @@ -0,0 +1,112 @@ +#ifndef FP_EMULATION_H_INCLUDED +#define FP_EMULATION_H_INCLUDED + +#include <math.h> + +int is_snan(float x) +{ + union32f_ud fb; + fb.f = x; + return __builtin_isnan(x) && (fb.u & 0x00400000) == 0; +} + +int is_qnan(float x) +{ + return __builtin_isnan(x) && !is_snan(x); +} + +#define INTEL_SSE_MATH_OP(op, a, b) \ +({ \ + union32f_ud tmp; \ + float result; \ + if (is_qnan(a)) \ + result = a; \ + else if (is_snan(a)) \ + { \ + tmp.f = a; tmp.u |= 0x400000; result = tmp.f; \ + } else if (is_snan(b)) \ + { \ + tmp.f = b; tmp.u |= 0x400000; result = tmp.f; \ + } else if (is_qnan(b)) \ + result = b; \ + else \ + result = a op b; \ + result; \ +}) + +float +intel_sse_math_add(float a, float b) +{ + return INTEL_SSE_MATH_OP(+, a, b); +} + +float +intel_sse_math_sub(float a, float b) +{ + return INTEL_SSE_MATH_OP(-, a, b); +} + +float +intel_sse_math_mul(float a, float b) +{ + return INTEL_SSE_MATH_OP(*, a, b); +} + +float +intel_sse_math_div(float a, float b) +{ + return INTEL_SSE_MATH_OP(/, a, b); +} + +/* +-3 == +-inf, + +-2 == +-Nan, use -2 only, + +-1 == +- normal number, + 0 == undefined/not intialized */ + +int +state_handler(int src0, int src1, __int128 a, __int128 b, char op) +{ + /* Nan */ + if (src0 == -2 || src1 == -2) + return -2; + + if (abs(src0) == 3 || abs(src1) == 3) + { + /* +INF + +INF = +INF, -INF + -INF = -INF */ + if (src0 == src1) + return src0; + + /* Positive result */ + if ((op == '*' && ((src0 < 0 && b < 0) || (src1 < 0 && a < 0))) || + (op == '*' && ((src0 > 0 && b > 0) || (src1 > 0 && a > 0)))) + return 3; + + /* -INF * (positive normal) = -INF */ + if (op == '*' && ((src0 < 0 || src1 < 0) && (a >= 0 && b >= 0))) + return -3; + + /* INF * 0 = NaN */ + if (a == 0 || b == 0) + return -2; + + /* -INF + +INF = NaN, else +INF or -INF */ + return (op == '+' && src0 != src1) ? -2 : 3 * ((src0 / 3) + (src1 / 3)); + } + + /* Normal number case */ + __int128 result = op == '+' ? a + b : a * b; + return result < 0 ? -1 : 1; +} + +int +state_handler_float (float src) +{ + if (isnan (src)) + return -2; + else if (isinf (src)) + return src > 0 ? 3 : -3; + else + return src >= 0 ? 1 : -1; +} + +#endif diff --git a/gcc/testsuite/gcc.target/i386/fp8-helper.h b/gcc/testsuite/gcc.target/i386/fp8-helper.h index b486db5..d3f526f 100644 --- a/gcc/testsuite/gcc.target/i386/fp8-helper.h +++ b/gcc/testsuite/gcc.target/i386/fp8-helper.h @@ -1,12 +1,21 @@ #ifndef FP8_HELPER_UNCLUDED #define FP8_HELPER_UNCLUDED +#include <stdint.h> +#include <inttypes.h> + typedef union { _Float16 f16; unsigned short u16; } Float16Union; +typedef union +{ + float f; + uint32_t u; +} Floatuint32Union; + static unsigned char convert_fp16_to_hf8 (_Float16 x, unsigned char b, int s) { @@ -132,4 +141,132 @@ convert_hf8_to_fp16(unsigned char x) return res.f16; } +static float +convert_bf8_to_fp32 (unsigned char x) +{ + Float16Union u = {.u16 = (x << 8) & 0xff00}; + return (float)(u.f16); +} + +static float +convert_hf8_to_fp32 (unsigned char x) +{ + Floatuint32Union res = {.f = 0.0f}; + unsigned int s = (x & 0x80) << 24, e = (x & 0x78) >> 3, m = x & 0x07; + unsigned int e_norm = e + 120; + + if (!e && m) + { + unsigned int lz_cnt = 2 - (m > 1) - (m > 3); + e_norm -= lz_cnt; + m = (m << (lz_cnt + 1) & 0x7); + } + else if (!e) + e_norm = 0; + else if (e == 0xf && m == 0x7) + e_norm = 255, m = 4; + + res.u |= (e_norm << 23) | (m << 20) | s; + + return res.f; +} + +static float +convert_fp8_to_fp32 (unsigned char x, int y) +{ + return y ? convert_bf8_to_fp32 (x) + : convert_hf8_to_fp32 (x); +} + +static int +fp8_isNan(unsigned char value, int bf8) +{ + unsigned char e = bf8 ? (value >> 2) & 0b11111 : (value >> 3) & 0b1111; + unsigned char m = bf8 ? value & 0b11 : value & 0b111; + + return bf8 ? (e == 0b11111 && m != 0) : (e == 0b1111 && m == 0b111); +} + +static int +bf8_isInf(unsigned char value) +{ + unsigned char sign = value >> 7; + unsigned char e = (value >> 2) & 0b11111; + unsigned char m = value & 0b11; + + if (e == 0b11111 && !m) + return sign ? -1 : 1; + return 0; +} + +#ifdef __x86_64__ +/* type = 0 for hf8 and type = 1 for bf8 */ +/* value is 2^16*x for bf8 and 2^9*x for hf8 */ +/* +-3 means +-inf, + +-2 means +-Nan, + and we will use -2 only, + +-1 means positive and negative no rmal numbers, + 0 means undefined/not initialised */ +static int64_t +shift_fp8_to_int64 (unsigned char x, int bf8, int *valueState) +{ + *valueState = (x >> 7) ? -1 : 1; + if (fp8_isNan (x, bf8)) + *valueState = -2; + if (bf8) + if (bf8_isInf(x) == 1) + *valueState = 3; + else if (bf8_isInf(x) == -1) + *valueState = -3; + + unsigned short sign = (x & 0x80) >> 7; + unsigned short exp = bf8 ? (x & 0x7c) >> 2 : (x & 0x78) >> 3; + unsigned short frac = bf8 ? (x & 0x03) : (x & 0x07); + unsigned short mant = (exp == 0) ? frac : (bf8 ? (frac | 0x4) : (frac | 0x8)); + unsigned short e_count = (exp == 0) ? 0 : exp - 1; + int64_t magnitude = (int64_t)mant << (int64_t)e_count; + + return sign ? -magnitude : magnitude; +} + + +/* type = 0 for hf8 and type = 1 for bf8 */ +static float +shift_int128_to_fp32 (__int128_t in, int type1, int type2) +{ + if (in == 0) + return 0; + + unsigned short sign = (in >> 127) & 1; + unsigned short Jbit_position = 126; + unsigned short fac = (type1 == type2) ? ( (type1) ? 32 : 18) : 25; + + __int128_t magnitude = sign ? -in : in; + + while (((magnitude >> 126) & 1) == 0) + { + Jbit_position --; + magnitude <<= 1; + } + + __int128_t sticky = (magnitude & (((__int128_t)1 << 102) - 1)) != 0; + __int128_t Gbit = (magnitude >> 102) & 1; + __int128_t Lbit = (magnitude >> 103) & 1; + __int128_t RndAddl = Gbit & ( Lbit | sticky); + __int128_t mant = (magnitude >> 103) & (((__int128_t)1 << 25) - 1); + + __int128_t Rndmant = mant + RndAddl; + __int128_t Ovf = Rndmant >> 24; + __int128_t exp = 127 + Jbit_position - fac + Ovf; + __int128_t frac = Rndmant & 0x7FFFFF; + + Floatuint32Union res; + res.u = sign << 31; + res.u |= exp << 23; + res.u |= frac; + + return res.f; +} +#endif + #endif diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc index 5a977ff..4aa6d7e 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc @@ -92,6 +92,7 @@ extern void test_avx10_2_512 (void) __attribute__((__target__("avx10.2-512"))); extern void test_amx_avx512 (void) __attribute__((__target__("amx-avx512"))); extern void test_amx_tf32 (void) __attribute__((__target__("amx-tf32"))); extern void test_amx_transpose (void) __attribute__((__target__("amx-transpose"))); +extern void test_amx_fp8 (void) __attribute__((__target__("amx-fp8"))); extern void test_no_sgx (void) __attribute__((__target__("no-sgx"))); extern void test_no_avx512vpopcntdq(void) __attribute__((__target__("no-avx512vpopcntdq"))); @@ -185,6 +186,7 @@ extern void test_no_avx10_2_512 (void) __attribute__((__target__("no-avx10.2-51 extern void test_no_amx_avx512 (void) __attribute__((__target__("no-amx-avx512"))); extern void test_no_amx_tf32 (void) __attribute__((__target__("no-amx-tf32"))); extern void test_no_amx_transpose (void) __attribute__((__target__("no-amx-transpose"))); +extern void test_no_amx_fp8 (void) __attribute__((__target__("no-amx-fp8"))); extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona"))); extern void test_arch_core2 (void) __attribute__((__target__("arch=core2"))); diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c index d13b606..f9568c9 100644 --- a/gcc/testsuite/gcc.target/i386/sse-12.c +++ b/gcc/testsuite/gcc.target/i386/sse-12.c @@ -3,7 +3,7 @@ popcntintrin.h gfniintrin.h and mm_malloc.h are usable with -O -std=c89 -pedantic-errors. */ /* { dg-do compile } */ -/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ +/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */ #include <x86intrin.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c index b24b513..ff4ac6a 100644 --- a/gcc/testsuite/gcc.target/i386/sse-13.c +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c index 03d21dc..2146cf5 100644 --- a/gcc/testsuite/gcc.target/i386/sse-14.c +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c index 7026d03..684ebfc 100644 --- a/gcc/testsuite/gcc.target/i386/sse-22.c +++ b/gcc/testsuite/gcc.target/i386/sse-22.c @@ -103,7 +103,7 @@ #ifndef DIFFERENT_PRAGMAS -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose") +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose,amx-fp8") #endif /* Following intrinsics require immediate arguments. They @@ -220,7 +220,7 @@ test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1) /* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */ #ifdef DIFFERENT_PRAGMAS -#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose") +#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose,amx-fp8") #endif #include <immintrin.h> test_1 (_cvtss_sh, unsigned short, float, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c index 88eabdf..8f3c379 100644 --- a/gcc/testsuite/gcc.target/i386/sse-23.c +++ b/gcc/testsuite/gcc.target/i386/sse-23.c @@ -1082,6 +1082,6 @@ #define __builtin_ia32_minmaxps128_mask(A, B, C, D, E) __builtin_ia32_minmaxps128_mask (A, B, 100, D, E) #define __builtin_ia32_minmaxps256_mask_round(A, B, C, D, E, F) __builtin_ia32_minmaxps256_mask_round (A, B, 100, D, E, 4) -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose") +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose,amx-fp8") #include <x86intrin.h> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 244d2d0..11fe21b 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -10808,6 +10808,17 @@ proc check_effective_target_amx_transpose { } { } "-mamx-transpose" ] } +# Return 1 if amx-fp8 instructions can be compiled. +proc check_effective_target_amx_fp8 { } { + return [check_no_compiler_messages amx_fp8 object { + void + foo () + { + __asm__ volatile ("tdpbf8ps\t%%tmm1, %%tmm2, %%tmm3" ::); + } + } "-mamx-fp8" ] +} + # Return 1 if sse instructions can be compiled. proc check_effective_target_sse { } { return [check_no_compiler_messages sse object { |