diff options
author | Haochen Jiang <haochen.jiang@intel.com> | 2024-11-01 10:04:38 +0800 |
---|---|---|
committer | Haochen Jiang <haochen.jiang@intel.com> | 2024-11-01 10:10:39 +0800 |
commit | 9f2f36a7db9070a9d6e1f0fb736a12217651d169 (patch) | |
tree | 7419168f2d410f14b2a44bc2f979723a478335f3 /gcc | |
parent | 8cc38abf575381905eb3a869b0874bdaddb608bb (diff) | |
download | gcc-9f2f36a7db9070a9d6e1f0fb736a12217651d169.zip gcc-9f2f36a7db9070a9d6e1f0fb736a12217651d169.tar.gz gcc-9f2f36a7db9070a9d6e1f0fb736a12217651d169.tar.bz2 |
Support Intel AMX-TRANSPOSE
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_available_features):
Detect AMX-TRANSPOSE.
* common/config/i386/i386-common.cc
(OPTION_MASK_ISA2_AMX_TRANSPOSE_SET,
OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET): New.
(ix86_handle_option): Handle -mamx-transpose.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_AMX_TRANSPOSE.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
amx-transpose.
* config.gcc: Add amxtransposeintrin.h.
* config/i386/cpuid.h (bit_AMX_TRANSPOSE): New.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__AMX_TRANSPOSE__.
* config/i386/i386-isa.def (AMX_TRANSPOSE): Add
DEF_PTA(AMX_TRANSPOSE).
* config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p):
Handle amx-transpose.
* config/i386/i386.opt: Add option -mamx-transpose.
* config/i386/i386.opt.urls: Regenerated.
* config/i386/immintrin.h: Include amxtransposeintrin.h.
* doc/extend.texi: Document amx-transpose.
* doc/invoke.texi: Document -mamx-transpose.
* doc/sourcebuild.texi: Document target amx-transpose.
* config/i386/amxtransposeintrin.h: New file.
gcc/testsuite/ChangeLog:
* g++.dg/other/i386-2.C: Add -mamx-transpose.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/amx-check.h: Add new check for amx-transpose.
(__tilepair): New.
(zero_pair_tile_src): New.
(check_pair_tile_register): New.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/amx-helper.h: Add amx-transpose support.
(init_pair_tile_src): New function.
* gcc.target/i386/sse-12.c: Add -mamx-tranpose.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add amx-transpose.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp (check_effective_target_amx_transposed): New.
* gcc.target/i386/amxtranspose-asmatt-1.c: New test.
* gcc.target/i386/amxtranspose-asmintel-1.c: Ditto.
* gcc.target/i386/amxtranspose-2rpntlvw-2.c: Ditto.
* gcc.target/i386/amxtranspose-conjtcmmimfp16ps-2.c: Ditto.
* gcc.target/i386/amxtranspose-conjtfp16-2.c: Ditto.
* gcc.target/i386/amxtranspose-tcmmimfp16ps-2.c: Ditto.
* gcc.target/i386/amxtranspose-tcmmrlfp16ps-2.c: Ditto.
* gcc.target/i386/amxtranspose-tdpbf16ps-2.c: Ditto.
* gcc.target/i386/amxtranspose-tdpfp16ps-2.c: Ditto.
* gcc.target/i386/amxtranspose-tmmultf32ps-2.c: Ditto.
* gcc.target/i386/amxtranspose-transposed-2.c: Ditto.
Diffstat (limited to 'gcc')
38 files changed, 857 insertions, 16 deletions
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 5d0a644..5a6aed0 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -1005,6 +1005,8 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_AMX_AVX512); if (eax & bit_AMX_TF32) set_feature (FEATURE_AMX_TF32); + if (eax & bit_AMX_TRANSPOSE) + set_feature (FEATURE_AMX_TRANSPOSE); } } diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 74ad039..a85b380 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -132,6 +132,8 @@ along with GCC; see the file COPYING3. If not see | OPTION_MASK_ISA2_AMX_AVX512) #define OPTION_MASK_ISA2_AMX_TF32_SET \ (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_TF32) +#define OPTION_MASK_ISA2_AMX_TRANSPOSE_SET \ + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_TRANSPOSE) /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same as -msse4.2. */ @@ -295,7 +297,7 @@ along with GCC; see the file COPYING3. If not see (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_INT8_UNSET \ | OPTION_MASK_ISA2_AMX_BF16_UNSET | OPTION_MASK_ISA2_AMX_FP16_UNSET \ | OPTION_MASK_ISA2_AMX_COMPLEX_UNSET | OPTION_MASK_ISA2_AMX_AVX512_UNSET \ - | OPTION_MASK_ISA2_AMX_TF32_UNSET) + | OPTION_MASK_ISA2_AMX_TF32_UNSET | OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET) #define OPTION_MASK_ISA2_AMX_INT8_UNSET OPTION_MASK_ISA2_AMX_INT8 #define OPTION_MASK_ISA2_AMX_BF16_UNSET OPTION_MASK_ISA2_AMX_BF16 #define OPTION_MASK_ISA2_UINTR_UNSET OPTION_MASK_ISA2_UINTR @@ -327,6 +329,7 @@ along with GCC; see the file COPYING3. If not see (OPTION_MASK_ISA2_AVX10_2_512 | OPTION_MASK_ISA2_AMX_AVX512_UNSET) #define OPTION_MASK_ISA2_AMX_AVX512_UNSET OPTION_MASK_ISA2_AMX_AVX512 #define OPTION_MASK_ISA2_AMX_TF32_UNSET OPTION_MASK_ISA2_AMX_TF32 +#define OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET OPTION_MASK_ISA2_AMX_TRANSPOSE /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same as -mno-sse4.1. */ @@ -1446,6 +1449,20 @@ ix86_handle_option (struct gcc_options *opts, } return true; + case OPT_mamx_transpose: + if (value) + { + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_TRANSPOSE_SET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_TRANSPOSE_SET; + } + else + { + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET; + opts->x_ix86_isa_flags2_explicit |= + OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET; + } + return true; + case OPT_mfma: if (value) { diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h index d19de25..f5f8ba2 100644 --- a/gcc/common/config/i386/i386-cpuinfo.h +++ b/gcc/common/config/i386/i386-cpuinfo.h @@ -271,6 +271,7 @@ enum processor_features FEATURE_AVX10_2_512, FEATURE_AMX_AVX512, FEATURE_AMX_TF32, + FEATURE_AMX_TRANSPOSE, CPU_FEATURE_MAX }; diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h index 0ba2e88..28c8d07 100644 --- a/gcc/common/config/i386/i386-isas.h +++ b/gcc/common/config/i386/i386-isas.h @@ -192,4 +192,6 @@ ISA_NAMES_TABLE_START ISA_NAMES_TABLE_ENTRY("amx-avx512", FEATURE_AMX_AVX512, P_NONE, "-mamx-avx512") ISA_NAMES_TABLE_ENTRY("amx-tf32", FEATURE_AMX_TF32, P_NONE, "-mamx-tf32") + ISA_NAMES_TABLE_ENTRY("amx-transpose", FEATURE_AMX_TRANSPOSE, + P_NONE, "-mamx-transpose") ISA_NAMES_TABLE_END diff --git a/gcc/config.gcc b/gcc/config.gcc index 58fecf7..5959714 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -458,7 +458,8 @@ i[34567]86-*-* | x86_64-*-*) avx10_2bf16intrin.h avx10_2-512bf16intrin.h avx10_2satcvtintrin.h avx10_2-512satcvtintrin.h avx10_2minmaxintrin.h avx10_2-512minmaxintrin.h - avx10_2copyintrin.h amxavx512intrin.h amxtf32intrin.h" + avx10_2copyintrin.h amxavx512intrin.h amxtf32intrin.h + amxtransposeintrin.h" ;; ia64-*-*) extra_headers=ia64intrin.h diff --git a/gcc/config/i386/amxtransposeintrin.h b/gcc/config/i386/amxtransposeintrin.h new file mode 100644 index 0000000..06bdd37 --- /dev/null +++ b/gcc/config/i386/amxtransposeintrin.h @@ -0,0 +1,177 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxtransposeintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AMXTRANSPOSEINTRIN_H_INCLUDED +#define _AMXTRANSPOSEINTRIN_H_INCLUDED + +#if !defined(__AMX_TRANSPOSE__) +#pragma GCC push_options +#pragma GCC target("amx-transpose") +#define __DISABLE_AMX_TRANSPOSE__ +#endif /* __AMX_TRANSPOSE__ */ + +#if defined(__x86_64__) +#define _tile_transposed_internal(dst,src) \ + __asm__ volatile\ + ("{ttransposed\t%%tmm"#src", %%tmm"#dst"|ttransposed\t%%tmm"#dst", %%tmm"#src"}" ::) + +#define _tile_2rpntlvwz0_internal(dst,base,stride) \ + __asm__ volatile\ + ("{t2rpntlvwz0\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz0\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((long) (stride))) + +#define _tile_2rpntlvwz0t1_internal(dst,base,stride) \ + __asm__ volatile\ + ("{t2rpntlvwz0t1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz0t1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*)(base)), "r" ((long)(stride))) + +#define _tile_2rpntlvwz1_internal(dst,base,stride) \ + __asm__ volatile\ + ("{t2rpntlvwz1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*)(base)), "r" ((long)(stride))) + +#define _tile_2rpntlvwz1t1_internal(dst,base,stride) \ + __asm__ volatile\ + ("{t2rpntlvwz1t1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz1t1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*)(base)), "r" ((long)(stride))) + +#define _tile_transposed(dst,src) \ + _tile_transposed_internal (dst, src) + +#define _tile_2rpntlvwz0(dst,base,stride) \ + _tile_2rpntlvwz0_internal (dst, base, stride) + +#define _tile_2rpntlvwz0t1(dst,base,stride) \ + _tile_2rpntlvwz0t1_internal (dst, base, stride) + +#define _tile_2rpntlvwz1(dst,base,stride) \ + _tile_2rpntlvwz1_internal (dst, base, stride) + +#define _tile_2rpntlvwz1t1(dst,base,stride) \ + _tile_2rpntlvwz1t1_internal (dst, base, stride) + +#if !defined(__AMX_BF16__) +#pragma GCC push_options +#pragma GCC target("amx-bf16") +#define __DISABLE_AMX_BF16__ +#endif /* __AMX_BF16__ */ + +#define _tile_tdpbf16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttdpbf16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttdpbf16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_tdpbf16ps(src1_dst,src2,src3) \ + _tile_tdpbf16ps_internal (src1_dst, src2, src3) + +#ifdef __DISABLE_AMX_BF16__ +#undef __DISABLE_AMX_BF16__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_BF16__ */ + +#if !defined(__AMX_FP16__) +#pragma GCC push_options +#pragma GCC target("amx-fp16") +#define __DISABLE_AMX_FP16__ +#endif /* __AMX_FP16__ */ + +#define _tile_tdpfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttdpfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttdpfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_tdpfp16ps(src1_dst,src2,src3) \ + _tile_tdpfp16ps_internal (src1_dst, src2, src3) + +#ifdef __DISABLE_AMX_FP16__ +#undef __DISABLE_AMX_FP16__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_FP16__ */ + +#if !defined(__AMX_COMPLEX__) +#pragma GCC push_options +#pragma GCC target("amx-complex") +#define __DISABLE_AMX_COMPLEX__ +#endif /* __AMX_COMPLEX__ */ + +#define _tile_conjtcmmimfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{tconjtcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tconjtcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_conjtfp16_internal(dst,src) \ + __asm__ volatile\ + ("{tconjtfp16\t%%tmm"#src", %%tmm"#dst"|tconjtfp16\t%%tmm"#dst", %%tmm"#src"}" ::) + +#define _tile_tcmmimfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_tcmmrlfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_conjtcmmimfp16ps(src1_dst,src2,src3) \ + _tile_conjtcmmimfp16ps_internal (src1_dst, src2, src3) + +#define _tile_conjtfp16(dst,src) \ + _tile_conjtfp16_internal (dst, src) + +#define _tile_tcmmimfp16ps(src1_dst,src2,src3) \ + _tile_tcmmimfp16ps_internal (src1_dst, src2, src3) + +#define _tile_tcmmrlfp16ps(src1_dst,src2,src3) \ + _tile_tcmmrlfp16ps_internal (src1_dst, src2, src3) + +#ifdef __DISABLE_AMX_COMPLEX__ +#undef __DISABLE_AMX_COMPLEX__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_COMPLEX__ */ + +#if !defined(__AMX_TF32__) +#pragma GCC push_options +#pragma GCC target("amx-tf32") +#define __DISABLE_AMX_TF32__ +#endif /* __AMX_TF32__ */ + +#define _tile_tmmultf32ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttmmultf32ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttmmultf32ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_tmmultf32ps(src1_dst,src2,src3) \ + _tile_tmmultf32ps_internal (src1_dst, src2, src3) + +#ifdef __DISABLE_AMX_TF32__ +#undef __DISABLE_AMX_TF32__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_TF32__ */ + +#endif /* __x86_64__ */ + +#ifdef __DISABLE_AMX_TRANSPOSE__ +#undef __DISABLE_AMX_TRANSPOSE__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_TRANSPOSE__ */ + +#endif /* _AMXTRANSPOSEINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 7cd8319..d609be9 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -164,6 +164,7 @@ /* AMX sub leaf (%eax == 0x1e, %ecx == 1) */ /* %eax */ +#define bit_AMX_TRANSPOSE (1 << 5) #define bit_AMX_TF32 (1 << 6) #define bit_AMX_AVX512 (1 << 7) diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index 98cb676..ee7497d 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -745,6 +745,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__AMX_AVX512__"); if (isa_flag2 & OPTION_MASK_ISA2_AMX_TF32) def_or_undef (parse_in, "__AMX_TF32__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_TRANSPOSE) + def_or_undef (parse_in, "__AMX_TRANSPOSE__"); if (TARGET_IAMCU) { def_or_undef (parse_in, "__iamcu"); diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def index 1b82a69..ab8c2b8 100644 --- a/gcc/config/i386/i386-isa.def +++ b/gcc/config/i386/i386-isa.def @@ -125,3 +125,4 @@ DEF_PTA(AVX10_2_256) DEF_PTA(AVX10_2_512) DEF_PTA(AMX_AVX512) DEF_PTA(AMX_TF32) +DEF_PTA(AMX_TRANSPOSE) diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 6f68e79..03808c4 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -265,7 +265,8 @@ static struct ix86_target_opts isa2_opts[] = { "-mavx10.2-256", OPTION_MASK_ISA2_AVX10_2_256 }, { "-mavx10.2-512", OPTION_MASK_ISA2_AVX10_2_512 }, { "-mamx-avx512", OPTION_MASK_ISA2_AMX_AVX512 }, - { "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 } + { "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 }, + { "-mamx-transpose", OPTION_MASK_ISA2_AMX_TRANSPOSE } }; static struct ix86_target_opts isa_opts[] = { @@ -1136,6 +1137,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("avx10.2-512", OPT_mavx10_2_512), IX86_ATTR_ISA ("amx-avx512", OPT_mamx_avx512), IX86_ATTR_ISA ("amx-tf32", OPT_mamx_tf32), + IX86_ATTR_ISA ("amx-transpose", OPT_mamx_transpose), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 280ad77..4922cad 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1398,3 +1398,7 @@ AVX10.2-512 and AMX-AVX512 built-in functions and code generation. mamx-tf32 Target Mask(ISA2_AMX_TF32) Var(ix86_isa_flags2) Save Support AMX-TF32 built-in functions and code generation. + +mamx-transpose +Target Mask(ISA2_AMX_TRANSPOSE) Var(ix86_isa_flags2) Save +Support AMX-TRANSPOSE built-in functions and code generation. diff --git a/gcc/config/i386/i386.opt.urls b/gcc/config/i386/i386.opt.urls index 9921cda..cda5484 100644 --- a/gcc/config/i386/i386.opt.urls +++ b/gcc/config/i386/i386.opt.urls @@ -619,3 +619,6 @@ UrlSuffix(gcc/x86-Options.html#index-mamx-avx512) mamx-tf32 UrlSuffix(gcc/x86-Options.html#index-mamx-tf32) +mamx-transpose +UrlSuffix(gcc/x86-Options.html#index-mamx-transpose) + diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index 84b8f60..a870cc6 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -136,6 +136,8 @@ #include <amxtf32intrin.h> +#include <amxtransposeintrin.h> + #include <prfchwintrin.h> #include <keylockerintrin.h> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 44cbe7c..73fc0c9 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -7554,6 +7554,11 @@ Enable/disable the generation of the AMX-AVX512 instructions. @itemx no-amx-tf32 Enable/disable the generation of the AMX-TF32 instructions. +@cindex @code{target("amx-transpose")} function attribute, x86 +@item amx-transpose +@itemx no-amx-transpose +Enable/disable the generation of the AMX-TRANSPOSE instructions. + @cindex @code{target("cld")} function attribute, x86 @item cld @itemx no-cld diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 15b7122..b2bb9e2 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -1486,7 +1486,7 @@ See RS/6000 and PowerPC Options. -mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mapxf -musermsr -mavx10.1 -mavx10.1-256 -mavx10.1-512 -mevex512 -mavx10.2 -mavx10.2-256 --mavx10.2-512 -mamx-avx512 -mamx-tf32 +-mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops -minline-stringops-dynamically -mstringop-strategy=@var{alg} -mkl -mwidekl @@ -35680,6 +35680,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @need 200 @opindex mamx-tf32 @itemx -mamx-tf32 +@need 200 +@opindex mamx-transpose +@itemx -mamx-transpose These switches enable the use of instructions in the MMX, SSE, AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA, AES, PCLMUL, CLFLUSHOPT, CLWB, FSGSBASE, PTWRITE, RDRND, F16C, FMA, PCONFIG, @@ -35690,9 +35693,9 @@ WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16, ENQCMD, AVX512VPOPCNTDQ, AVX512VNNI, SERIALIZE, UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16, AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, AMX-FP16, PREFETCHI, RAOINT, AMX-COMPLEX, AVXVNNIINT16, SM3, SHA512, -SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32 or CLDEMOTE -extended instruction sets. Each has a corresponding @option{-mno-} option to -disable use of these instructions. +SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32, AMX-TRANSPOSE or +CLDEMOTE extended instruction sets. Each has a corresponding @option{-mno-} +option to disable use of these instructions. These extensions are also available as built-in functions: see @ref{x86 Built-in Functions}, for details of the functions enabled and diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index 5bb4bf1..09831b6 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -2656,6 +2656,9 @@ Target supports the execution of @code{amx-fp16} instructions. @item amx_tf32 Target supports the execution of @code{amx-tf32} instructions. +@item amx_transpose +Target supports the execution of @code{amx-transpose} instructions. + @item cell_hw Test system can execute AltiVec and Cell PPU instructions. diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C index df985f1..05461ed 100644 --- a/gcc/testsuite/g++.dg/other/i386-2.C +++ b/gcc/testsuite/g++.dg/other/i386-2.C @@ -1,5 +1,5 @@ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */ +/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ /* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */ /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h, diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C index 0fa8bc7..0e5df7a 100644 --- a/gcc/testsuite/g++.dg/other/i386-3.C +++ b/gcc/testsuite/g++.dg/other/i386-3.C @@ -1,5 +1,5 @@ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */ +/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ /* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */ /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h, diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h index e5e3522..28a71cb 100644 --- a/gcc/testsuite/gcc.target/i386/amx-check.h +++ b/gcc/testsuite/gcc.target/i386/amx-check.h @@ -50,6 +50,14 @@ typedef struct __tile int colsb; } __tile; +typedef struct __tilepair +{ + /* Max size of tile register */ + uint8_t buf[2048]; + int rows; + int colsb; +} __tilepair; + /* Maxium col/row size in bytes */ #define MAX_ROWS 16 #define MAX_COLS 64 @@ -141,6 +149,12 @@ void zero_tile_src (__tile *src) src->buf[i * src->colsb + j] = 0; } +/* Zero __tilepair src. It should be init first. */ +void zero_pair_tile_src (__tilepair *src) +{ + memset(src->buf, 0, 2048); +} + /* Compare tile config value with __tilecfg_u dst */ int check_tile_config (__tilecfg_u *src, __tilecfg_u *dst) { @@ -191,6 +205,27 @@ int check_float_tile_register (__tile* ref, __tile* target) return 1; } +/* Compare pair_tile register value with __tile variable */ +int check_pair_tile_register (__tile* ref_0, __tile* ref_1, __tilepair* target) +{ + /* Tile register should be stored from tmm to + memory and compare with emulation results. */ + int rows = target->rows; + int colsb = target->colsb; + int i, j; + + for (i = 0; i < rows; i++) + for (j = 0; j < colsb; j++) + { + if (ref_0->buf[i * colsb + j] != target->buf[i * colsb + j]) + return 0; + if (ref_1->buf[i * colsb + j] != target->buf[rows * colsb + i * colsb + j]) + return 0; + } + + return 1; +} + #ifndef DO_TEST #define DO_TEST do_test static void test_amx (void); @@ -225,6 +260,9 @@ main () #ifdef AMX_TF32 && __builtin_cpu_supports ("amx-tf32") #endif +#ifdef AMX_TRANSPOSE + && __builtin_cpu_supports ("amx-transpose") +#endif #ifdef __linux__ && request_perm_xtile_data () #endif diff --git a/gcc/testsuite/gcc.target/i386/amx-helper.h b/gcc/testsuite/gcc.target/i386/amx-helper.h index 0fdea0c..b505581 100644 --- a/gcc/testsuite/gcc.target/i386/amx-helper.h +++ b/gcc/testsuite/gcc.target/i386/amx-helper.h @@ -74,7 +74,7 @@ void init_fp16_max_tile_zero_buffer (uint8_t* buf) } #endif -#if defined (AMX_AVX512) +#if defined (AMX_AVX512) || defined (AMX_BF16) /* Transformation functions between bf16/float */ static uint16_t make_f32_bf16 (float f) { @@ -178,4 +178,42 @@ static float silence_snan_fp32 (float x) return tmp.f; } +void init_pair_tile_src (int tmm_num, __tilepair *src, uint8_t *_buffer, int z) +{ + int rows, colsb, start, i, j, t, elements[2]; + uint16_t *buffer = (uint16_t *) _buffer; + uint16_t *ptr = (uint16_t *) src->buf; + __tilecfg_u tmp; + + _tile_storeconfig (tmp.a); + + tmm_num &= ~1; + + rows = tmp.s.rows[tmm_num]; + colsb = tmp.s.colsb[tmm_num]; + start = tmp.s.start_row; + + zero_pair_tile_src (src); + + for (t = 0; t < 2; t++) + elements[t] = tmp.s.colsb[tmm_num + t] / 4; + + src->colsb = (tmp.s.colsb[tmm_num] + tmp.s.colsb[tmm_num + 1]) / 2; + src->rows = rows; + + while (start < 2 * rows) + { + int r = start / 2; + int w = start % 2; + + if (start < 2 * rows - z) + for (t = 0; t < 2; t++) + if (tmp.s.colsb[tmm_num + t] > 0) + for (i = 0; i < elements[t]; i++) + ptr[t * rows * colsb / 2 + r * elements[t] * 2 + 2 * i + w] = + buffer[start * colsb / 2 + t * elements[0] + i]; + start++; + } +} + #endif diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-2rpntlvw-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-2rpntlvw-2.c new file mode 100644 index 0000000..3b1c870 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-2rpntlvw-2.c @@ -0,0 +1,41 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-options "-O2 -mamx-transpose" } */ +#define AMX_TRANSPOSE +#define DO_TEST test_amx_transpose_t2rpntlvw +void test_amx_transpose_t2rpntlvw (); +#include "amx-helper.h" +#define init_pair_tile_reg_and_src_z(tmm_num, src, buffer, ztype) \ +{ \ + init_pair_tile_src (tmm_num, &src, buffer, ztype); \ + _tile_2rpntlvwz##ztype (tmm_num, buffer, _STRIDE); \ +} + +void test_amx_transpose_t2rpntlvw () +{ + __tilecfg_u cfg; + __tilepair src; + __tile ref_0, ref_1; + uint8_t buffer[2048]; + int i; + + init_tile_config (&cfg); + + for (i = 0; i < 2048; i++) + buffer[i] = i % 256; + + /* Check t2rpntlvwz0. */ + init_pair_tile_reg_and_src_z (0, src, buffer, 0); + _tile_stored (0, ref_0.buf, _STRIDE); + _tile_stored (1, ref_1.buf, _STRIDE); + if (!check_pair_tile_register (&ref_0, &ref_1, &src)) + abort (); + + /* Check t2rpntlvwz1. */ + init_pair_tile_reg_and_src_z (1, src, buffer, 1); + _tile_stored (0, ref_0.buf, _STRIDE); + _tile_stored (1, ref_1.buf, _STRIDE); + if (!check_pair_tile_register (&ref_0, &ref_1, &src)) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-asmatt-1.c b/gcc/testsuite/gcc.target/i386/amxtranspose-asmatt-1.c new file mode 100644 index 0000000..a970f5d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-asmatt-1.c @@ -0,0 +1,39 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mamx-transpose -mamx-bf16 -mamx-complex -mamx-fp16 -mamx-tf32" } */ +/* { dg-final { scan-assembler "ttdpbf16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "ttdpfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "ttransposed\[ \\t]+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "t2rpntlvwz0\[ \\t]+\[^\n\]*\\(%\[a-z0-9]*\,%\[a-z0-9\]*\,\[124\]\\)+\[^\n\]*%tmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "t2rpntlvwz0t1\[ \\t]+\[^\n\]*\\(%\[a-z0-9]*\,%\[a-z0-9\]*\,\[124\]\\)+\[^\n\]*%tmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "t2rpntlvwz1\[ \\t]+\[^\n\]*\\(%\[a-z0-9]*\,%\[a-z0-9\]*\,\[124\]\\)+\[^\n\]*%tmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "t2rpntlvwz1t1\[ \\t]+\[^\n\]*\\(%\[a-z0-9]*\,%\[a-z0-9\]*\,\[124\]\\)+\[^\n\]*%tmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "tconjtcmmimfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "tconjtfp16\[ \\t]+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "ttcmmimfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "ttcmmrlfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +/* { dg-final { scan-assembler "ttmmultf32ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */ +#include <immintrin.h> + +extern const void* base; +extern const int stride; + +#define TMM0 0 +#define TMM1 1 +#define TMM2 2 +#define TMM3 3 + +void TEST() +{ + _tile_tdpbf16ps (TMM1, TMM2, TMM3); + _tile_tdpfp16ps (TMM1, TMM2, TMM3); + _tile_transposed (TMM1, TMM2); + _tile_2rpntlvwz0 (TMM0, base, stride); + _tile_2rpntlvwz0t1 (TMM1, base, stride); + _tile_2rpntlvwz1 (TMM2, base, stride); + _tile_2rpntlvwz1t1 (TMM3, base, stride); + _tile_conjtcmmimfp16ps (TMM1, TMM2, TMM3); + _tile_conjtfp16 (TMM1, TMM2); + _tile_tcmmimfp16ps (TMM1, TMM2, TMM3); + _tile_tcmmrlfp16ps (TMM1, TMM2, TMM3); + _tile_tmmultf32ps (TMM1, TMM2, TMM3); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-asmintel-1.c b/gcc/testsuite/gcc.target/i386/amxtranspose-asmintel-1.c new file mode 100644 index 0000000..2cf73ae --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-asmintel-1.c @@ -0,0 +1,35 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-require-effective-target masm_intel } */ +/* { dg-options "-O2 -mamx-transpose -mamx-bf16 -mamx-complex -mamx-fp16 -mamx-tf32 -masm=intel" } */ +/* { dg-final { scan-assembler "ttdpbf16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +/* { dg-final { scan-assembler "ttdpfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +/* { dg-final { scan-assembler "ttransposed\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2" } } */ +/* { dg-final { scan-assembler "t2rpntlvwz0\[ \\t]%tmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "t2rpntlvwz0t1\[ \\t]%tmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "t2rpntlvwz1\[ \\t]%tmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "t2rpntlvwz1t1\[ \\t]%tmm\[0-9\]" } } */ +/* { dg-final { scan-assembler "tconjtcmmimfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +/* { dg-final { scan-assembler "tconjtfp16\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2" } } */ +/* { dg-final { scan-assembler "ttcmmimfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +/* { dg-final { scan-assembler "ttcmmrlfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +/* { dg-final { scan-assembler "ttmmultf32ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */ +#include <immintrin.h> + +extern const void* base; +extern const int stride; + +void TEST() +{ + _tile_tdpbf16ps (1, 2, 3); + _tile_tdpfp16ps (1, 2, 3); + _tile_transposed (1, 2); + _tile_2rpntlvwz0 (5, base, stride); + _tile_2rpntlvwz0t1 (4, base, stride); + _tile_2rpntlvwz1 (3, base, stride); + _tile_2rpntlvwz1t1 (2, base, stride); + _tile_conjtcmmimfp16ps (1, 2, 3); + _tile_conjtfp16 (1, 2); + _tile_tcmmimfp16ps (1, 2, 3); + _tile_tcmmrlfp16ps (1, 2, 3); + _tile_tmmultf32ps (1, 2, 3); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-conjtcmmimfp16ps-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-conjtcmmimfp16ps-2.c new file mode 100644 index 0000000..159867d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-conjtcmmimfp16ps-2.c @@ -0,0 +1,55 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-require-effective-target amx_complex } */ +/* { dg-require-effective-target avx512fp16 } */ +/* { dg-options "-O2 -mamx-transpose -mamx-complex -mavx512fp16" } */ +#define AMX_TRANSPOSE +#define AMX_COMPLEX +#define DO_TEST test_amx_transpose_conjtcmmimfp16ps +void test_amx_transpose_conjtcmmimfp16ps (); +#include "amx-helper.h" + +void calc_matrix_conjtcmmimfp16ps (__tile *dst, __tile *src1, __tile *src2) +{ + uint16_t *src1_buf = (uint16_t *) src1->buf; + uint16_t *src2_buf = (uint16_t *) src2->buf; + float *dst_buf = (float *) dst->buf; + + int K = src1->rows; + int M = src1->colsb / 4; + int N = src2->colsb / 4; + int m, k, n, t; + + for (m = 0; m < M; m++) + for (k = 0; k < K; k++) + for (n = 0; n < N; n++) + for (t = 0; t < 2; t+=2) + dst_buf[m * N + n] += + (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t]) * + make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t + 1])) - + (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t + 1]) * + make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t])); +} + +void test_amx_transpose_conjtcmmimfp16ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024]; + + init_fp16_max_tile_buffer (tmp_dst_buf); + init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf); + + calc_matrix_conjtcmmimfp16ps (&dst, &src1, &src2); + + _tile_conjtcmmimfp16ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-conjtfp16-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-conjtfp16-2.c new file mode 100644 index 0000000..710d76a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-conjtfp16-2.c @@ -0,0 +1,48 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-require-effective-target amx_complex } */ +/* { dg-require-effective-target avx512fp16 } */ +/* { dg-options "-O2 -mamx-transpose -mamx-complex -mavx512fp16" } */ +#define AMX_TRANSPOSE +#define AMX_COMPLEX +#define DO_TEST test_amx_transpose_conjtfp16 +void test_amx_transpose_conjtfp16 (); +#include "amx-helper.h" + +void calc_matrix_conjtfp16 (__tile *dst, __tile *src) +{ + uint16_t *src_buf = (uint16_t *) src->buf; + float *dst_buf = (float *) dst->buf; + + int M = dst->rows; + int N = dst->colsb / 4; + int i, j, t; + + for (i = 0; i < M; i++) + for (j = 0; j < N; j++) + for (t = 0; t < 2; t+=2) + { + dst_buf[i * 2 * N + 2 * j + t] = src_buf[j * 2 * M + 2 * i + t]; + dst_buf[i * 2 * N + 2 * j + t + 1] = -src_buf[j * 2 * M + 2 * i + t + 1]; + } +} + +void test_amx_transpose_conjtfp16 () +{ + __tilecfg_u cfg; + __tile src, dst, ref; + uint8_t tmp_dst_buf[1024]; + + init_fp16_max_tile_buffer (tmp_dst_buf); + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (2, src, tmp_dst_buf); + + /* Check tconjtfp16. */ + calc_matrix_conjtfp16 (&dst, &src); + _tile_conjtfp16 (1, 2); + _tile_stored (1, ref.buf, _STRIDE); + + if (!check_tile_register (&ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-tcmmimfp16ps-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-tcmmimfp16ps-2.c new file mode 100644 index 0000000..e2a0f10 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-tcmmimfp16ps-2.c @@ -0,0 +1,55 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-require-effective-target amx_complex } */ +/* { dg-require-effective-target avx512fp16 } */ +/* { dg-options "-O2 -mamx-transpose -mamx-complex -mavx512fp16" } */ +#define AMX_TRANSPOSE +#define AMX_COMPLEX +#define DO_TEST test_amx_transpose_tcmmimfp16ps +void test_amx_transpose_tcmmimfp16ps (); +#include "amx-helper.h" + +void calc_matrix_tcmmimfp16ps (__tile *dst, __tile *src1, __tile *src2) +{ + uint16_t *src1_buf = (uint16_t *) src1->buf; + uint16_t *src2_buf = (uint16_t *) src2->buf; + float *dst_buf = (float *) dst->buf; + + int K = src1->rows; + int M = src1->colsb / 4; + int N = src2->colsb / 4; + int m, k, n, t; + + for (m = 0; m < M; m++) + for (k = 0; k < K; k++) + for (n = 0; n < N; n++) + for (t = 0; t < 2; t+=2) + dst_buf[m * N + n] += + (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t]) * + make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t + 1])) + + (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t + 1]) * + make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t])); +} + +void test_amx_transpose_tcmmimfp16ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024]; + + init_fp16_max_tile_buffer (tmp_dst_buf); + init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf); + + calc_matrix_tcmmimfp16ps (&dst, &src1, &src2); + + _tile_tcmmimfp16ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-tcmmrlfp16ps-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-tcmmrlfp16ps-2.c new file mode 100644 index 0000000..b09186c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-tcmmrlfp16ps-2.c @@ -0,0 +1,55 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-require-effective-target amx_complex } */ +/* { dg-require-effective-target avx512fp16 } */ +/* { dg-options "-O2 -mamx-transpose -mamx-complex -mavx512fp16" } */ +#define AMX_TRANSPOSE +#define AMX_COMPLEX +#define DO_TEST test_amx_transpose_tcmmrlfp16ps +void test_amx_transpose_tcmmrlfp16ps (); +#include "amx-helper.h" + +void calc_matrix_tcmmrlfp16ps (__tile *dst, __tile *src1, __tile *src2) +{ + uint16_t *src1_buf = (uint16_t *) src1->buf; + uint16_t *src2_buf = (uint16_t *) src2->buf; + float *dst_buf = (float *) dst->buf; + + int K = src1->rows; + int M = src1->colsb / 4; + int N = src2->colsb / 4; + int m, k, n, t; + + for (m = 0; m < M; m++) + for (k = 0; k < K; k++) + for (n = 0; n < N; n++) + for (t = 0; t < 2; t+=2) + dst_buf[m * N + n] += + (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t]) * + make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t])) - + (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t + 1]) * + make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t + 1])); +} + +void test_amx_transpose_tcmmrlfp16ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024]; + + init_fp16_max_tile_buffer (tmp_dst_buf); + init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf); + + calc_matrix_tcmmrlfp16ps (&dst, &src1, &src2); + + _tile_tcmmrlfp16ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-tdpbf16ps-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-tdpbf16ps-2.c new file mode 100644 index 0000000..6a3226b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-tdpbf16ps-2.c @@ -0,0 +1,53 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-require-effective-target amx_bf16 } */ +/* { dg-options "-O2 -mamx-transpose -mamx-bf16 -mavx512bf16" } */ +#define AMX_TRANSPOSE +#define AMX_BF16 +#define DO_TEST test_amx_transpose_tdpbf16ps +void test_amx_transpose_tdpbf16ps (); +#include "amx-helper.h" + +void calc_matrix_tdpbf16ps(__tile *dst, __tile *src1, __tile *src2) +{ + uint16_t *src1_buf = (uint16_t *) src1->buf; + uint16_t *src2_buf = (uint16_t *) src2->buf; + float *dst_buf = (float *) dst->buf; + + int K = src1->rows; + int M = src1->colsb / 4; + int N = src2->colsb / 4; + int m, k, n, t; + + for (m = 0; m < M; m++) + for (k = 0; k < K; k++) + for (n = 0; n < N; n++) + for (t = 0; t < 2; t+=2) + dst_buf[m * N + n] += + (make_bf16_f32 (src1_buf[k * 2 * M + 2 * m + t]) * + make_bf16_f32 (src2_buf[k * 2 * N + 2 * n + t])) + + (make_bf16_f32 (src1_buf[k * 2 * M + 2 * m + t + 1]) * + make_bf16_f32 (src2_buf[k * 2 * N + 2 * n + t + 1])); +} + +void test_amx_transpose_tdpbf16ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_buf[1024]; + + init_bf16_max_tile_buffer (tmp_dst_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf); + + calc_matrix_tdpbf16ps (&dst, &src1, &src2); + + _tile_tdpbf16ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_float_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-tdpfp16ps-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-tdpfp16ps-2.c new file mode 100644 index 0000000..83c3715 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-tdpfp16ps-2.c @@ -0,0 +1,55 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-require-effective-target amx_fp16 } */ +/* { dg-require-effective-target avx512fp16 } */ +/* { dg-options "-O2 -mamx-transpose -mamx-fp16 -mavx512fp16" } */ +#define AMX_TRANSPOSE +#define AMX_FP16 +#define DO_TEST test_amx_transpose_tdpfp16ps +void test_amx_transpose_tdpfp16ps (); +#include "amx-helper.h" + +void calc_matrix_tdpfp16ps(__tile *dst, __tile *src1, __tile *src2) +{ + uint16_t *src1_buf = (uint16_t *) src1->buf; + uint16_t *src2_buf = (uint16_t *) src2->buf; + float *dst_buf = (float *) dst->buf; + + int K = src1->rows; + int M = src1->colsb / 4; + int N = src2->colsb / 4; + int m, k, n, t; + + for (m = 0; m < M; m++) + for (k = 0; k < K; k++) + for (n = 0; n < N; n++) + for (t = 0; t < 2; t+=2) + dst_buf[m * N + n] += + (make_fp16_f32 (src1_buf[k * 2 * M + 2 * m + t]) * + make_fp16_f32 (src2_buf[k * 2 * N + 2 * n + t])) + + (make_fp16_f32 (src1_buf[k * 2 * M + 2 * m + t + 1]) * + make_fp16_f32 (src2_buf[k * 2 * N + 2 * n + t + 1])); +} + +void test_amx_transpose_tdpfp16ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024]; + + init_fp16_max_tile_buffer (tmp_dst_buf); + init_fp16_max_tile_zero_buffer(tmp_dst_zero_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf); + + calc_matrix_tdpfp16ps (&dst, &src1, &src2); + + _tile_tdpfp16ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_float_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-tmmultf32ps-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-tmmultf32ps-2.c new file mode 100644 index 0000000..44166c1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-tmmultf32ps-2.c @@ -0,0 +1,51 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-require-effective-target amx_tf32 } */ +/* { dg-options "-O2 -mamx-transpose -mamx-tf32" } */ +#define AMX_TRANSPOSE +#define AMX_TF32 +#define DO_TEST test_amx_transpose_tmmultf32ps +void test_amx_transpose_tmmultf32ps(); +#include "amx-helper.h" + +void calc_matrix_tmmultf32ps(__tile *dst, __tile *src1, __tile *src2) +{ + float *src1_buf = (float *) src1->buf; + float *src2_buf = (float *) src2->buf; + float *dst_buf = (float *) dst->buf; + + int K = src1->rows; + int M = src1->colsb / 4; + int N = src2->colsb / 4; + int m, n, k; + + for (m = 0; m < M; m++) + for (k = 0; k < K; k++) + for (n = 0; n < N; n++) + dst_buf[m * N + n] += + zero_lower_mantissa_bits_fp32 (silence_snan_fp32 (src1_buf[k * M + m])) * + zero_lower_mantissa_bits_fp32 (silence_snan_fp32 (src2_buf[k * N + n])); + +} + +void test_amx_transpose_tmmultf32ps () +{ + __tilecfg_u cfg; + __tile dst, dst_ref, src1, src2; + uint8_t tmp_dst_buf[1024]; + + init_fp32_max_tile_buffer (tmp_dst_buf); + + init_tile_config (&cfg); + init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf); + init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf); + + calc_matrix_tmmultf32ps (&dst, &src1, &src2); + + _tile_tmmultf32ps (1, 2, 3); + _tile_stored (1, dst_ref.buf, _STRIDE); + + if (!check_tile_register (&dst_ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/amxtranspose-transposed-2.c b/gcc/testsuite/gcc.target/i386/amxtranspose-transposed-2.c new file mode 100644 index 0000000..73c709c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/amxtranspose-transposed-2.c @@ -0,0 +1,39 @@ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-require-effective-target amx_transpose } */ +/* { dg-options "-O2 -mamx-transpose" } */ +#define AMX_TRANSPOSE +#define DO_TEST test_amx_transpose_transposed +void test_amx_transpose_transposed (); +#include "amx-helper.h" + +void calc_matrix_ttransposed (__tile *dst, __tile *src) +{ + uint32_t *src_buf = (uint32_t *) src->buf; + uint32_t *dst_buf = (uint32_t *) dst->buf; + + int M = src->rows; + int N = src->colsb / 4; + int i, j; + + for (i = 0; i < M; i++) + for (j = 0; j < N; j++) + dst_buf[j * M + i] = (uint32_t) src_buf[i * N + j]; +} + +void test_amx_transpose_transposed () +{ + __tilecfg_u cfg; + __tile src, dst, ref; + + init_tile_config (&cfg); + init_tile_reg_and_src (1, dst); + init_tile_reg_and_src (2, src); + + /* Check ttransposed. */ + calc_matrix_ttransposed (&dst, &src); + _tile_transposed (1, 2); + _tile_stored (1, ref.buf, _STRIDE); + + if (!check_tile_register (&ref, &dst)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc index 1ad4c1e..5a977ff 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc +++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc @@ -91,6 +91,7 @@ extern void test_avx10_2 (void) __attribute__((__target__("avx10.2"))); extern void test_avx10_2_512 (void) __attribute__((__target__("avx10.2-512"))); extern void test_amx_avx512 (void) __attribute__((__target__("amx-avx512"))); extern void test_amx_tf32 (void) __attribute__((__target__("amx-tf32"))); +extern void test_amx_transpose (void) __attribute__((__target__("amx-transpose"))); extern void test_no_sgx (void) __attribute__((__target__("no-sgx"))); extern void test_no_avx512vpopcntdq(void) __attribute__((__target__("no-avx512vpopcntdq"))); @@ -183,6 +184,7 @@ extern void test_no_avx10_2 (void) __attribute__((__target__("no-avx10.2"))); extern void test_no_avx10_2_512 (void) __attribute__((__target__("no-avx10.2-512"))); extern void test_no_amx_avx512 (void) __attribute__((__target__("no-amx-avx512"))); extern void test_no_amx_tf32 (void) __attribute__((__target__("no-amx-tf32"))); +extern void test_no_amx_transpose (void) __attribute__((__target__("no-amx-transpose"))); extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona"))); extern void test_arch_core2 (void) __attribute__((__target__("arch=core2"))); diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c index 7688ec3..d13b606 100644 --- a/gcc/testsuite/gcc.target/i386/sse-12.c +++ b/gcc/testsuite/gcc.target/i386/sse-12.c @@ -3,7 +3,7 @@ popcntintrin.h gfniintrin.h and mm_malloc.h are usable with -O -std=c89 -pedantic-errors. */ /* { dg-do compile } */ -/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */ +/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ #include <x86intrin.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c index c897b1a..b24b513 100644 --- a/gcc/testsuite/gcc.target/i386/sse-13.c +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */ +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c index 4866df3f..03d21dc 100644 --- a/gcc/testsuite/gcc.target/i386/sse-14.c +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */ /* { dg-add-options bind_pic_locally } */ #include <mm_malloc.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c index 5d95a8b..7026d03 100644 --- a/gcc/testsuite/gcc.target/i386/sse-22.c +++ b/gcc/testsuite/gcc.target/i386/sse-22.c @@ -103,7 +103,7 @@ #ifndef DIFFERENT_PRAGMAS -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32") +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose") #endif /* Following intrinsics require immediate arguments. They @@ -220,7 +220,7 @@ test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1) /* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */ #ifdef DIFFERENT_PRAGMAS -#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32") +#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose") #endif #include <immintrin.h> test_1 (_cvtss_sh, unsigned short, float, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c index edd1e1c..88eabdf 100644 --- a/gcc/testsuite/gcc.target/i386/sse-23.c +++ b/gcc/testsuite/gcc.target/i386/sse-23.c @@ -1082,6 +1082,6 @@ #define __builtin_ia32_minmaxps128_mask(A, B, C, D, E) __builtin_ia32_minmaxps128_mask (A, B, 100, D, E) #define __builtin_ia32_minmaxps256_mask_round(A, B, C, D, E, F) __builtin_ia32_minmaxps256_mask_round (A, B, 100, D, E, 4) -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32") +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose") #include <x86intrin.h> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index dd166eb..244d2d0 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -10797,6 +10797,17 @@ proc check_effective_target_amx_tf32 { } { } "-mamx-tf32" ] } +# Return 1 if amx-transpose instructions can be compiled. +proc check_effective_target_amx_transpose { } { + return [check_no_compiler_messages amx_transpose object { + void + foo () + { + __asm__ volatile ("ttransposed\t%%tmm1, %%tmm2" ::); + } + } "-mamx-transpose" ] +} + # Return 1 if sse instructions can be compiled. proc check_effective_target_sse { } { return [check_no_compiler_messages sse object { |