Guard truncate from vector float to vector __bf16 with !flag_rounding_math && !HONOR_NANS (BFmode).

The hw instruction doesn't raise exceptions, turns sNaN into qNaN quietly, and always rounds to nearest (even). Output denormals are always flushed to zero and input denormals are always treated as zero. MXCSR is neither consulted nor updated. W/o native instructions, flag_unsafe_math_optimizations is needed for the permutation instructions. Similarly, guard extend from vector __bf16 to vector float with !HONOR_NANS (BFmode). gcc/ChangeLog: * config/i386/i386.md (truncsfbf2): Add !flag_rounding_math to the condition, require flag_unsafe_math_optimizations when native instruction is not available. * config/i386/mmx.md: (truncv2sfv2bf2): Ditto. (extendv2bfv2sf2): Add !HONOR_NANS (BFmode) to the condition. * config/i386/sse.md: (truncv4sfv4bf2): Add !flag_rounding_math to the condition, require flag_unsafe_math_optimizations when native instruction is not available. (truncv8sfv8bf2): Ditto. (truncv16sfv16bf2): Ditto. (extendv4bfv4sf2): Add !HONOR_NANS (BFmode) to the condition. (extendv8bfv8sf2): Ditto. (extendv16bfv16sf2): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512bf16-truncsfbf.c: Add -ffast-math. * gcc.target/i386/avx512bw-extendbf2sf.c: Ditto. * gcc.target/i386/avx512bw-truncsfbf.c: Ditto. * gcc.target/i386/sse2-extendbf2sf.c: Ditto. * gcc.target/i386/ssse3-truncsfbf.c: Ditto.
author: liuhongt <hongtao.liu@intel.com> 2024-11-06 18:15:42 -0800
committer: liuhongt <hongtao.liu@intel.com> 2024-11-10 18:20:23 -0800
commit: de867e8da30bf5e0cb51c3946ec43c3c4778d4a0 (patch)
tree: 0fdb1dfc26d2e7121de3e339ea2abc63fcfaf45d
parent: ca1cff0c924dfce7d7792dbeab978bbbf65df0fa (diff)
download: gcc-de867e8da30bf5e0cb51c3946ec43c3c4778d4a0.zip
gcc-de867e8da30bf5e0cb51c3946ec43c3c4778d4a0.tar.gz
gcc-de867e8da30bf5e0cb51c3946ec43c3c4778d4a0.tar.bz2
8 files changed, 33 insertions, 12 deletions
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 34bc046..f4aae80 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5698,11 +5698,20 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
 
+/* vcvtneps2bf16 doesn't honor sNaN, and turns sNaN into qNaN quietly,
+   and it always rounds to even.
+   flag_unsafe_math_optimizations is needed for psrld.
+   If we don't expect qNaNs nor sNaNs and can assume rounding
+   to nearest, we can expand the conversion inline as
+   (fromi + 0x7fff + ((fromi >> 16) & 1)) >> 16.  */
 (define_insn "truncsfbf2"
   [(set (match_operand:BF 0 "register_operand" "=x,x,v,Yv")
 	(float_truncate:BF
 	  (match_operand:SF 1 "register_operand" "0,x,v,Yv")))]
-  "TARGET_SSE2 && flag_unsafe_math_optimizations && !HONOR_NANS (BFmode)"
+  "TARGET_SSE2 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
   "@
   psrld\t{$16, %0|%0, 16}
   %{vex%} vcvtneps2bf16\t{%1, %0|%0, %1}
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 021ac90..61a4f4d 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2998,7 +2998,11 @@
   [(set (match_operand:V2BF 0 "register_operand")
 	(float_truncate:V2BF
 	  (match_operand:V2SF 1 "nonimmediate_operand")))]
-  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE
+  && !HONOR_NANS (BFmode) && !flag_rounding_math
+  && (flag_unsafe_math_optimizations
+      || TARGET_AVXNECONVERT
+      || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   rtx op1 = gen_reg_rtx (V4SFmode);
   rtx op0 = gen_reg_rtx (V4BFmode);
@@ -3016,7 +3020,7 @@
   [(set (match_operand:V2SF 0 "register_operand")
 	(float_extend:V2SF
 	  (match_operand:V2BF 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_MMX_WITH_SSE"
+  "TARGET_SSE2 && TARGET_MMX_WITH_SSE && !HONOR_NANS (BFmode)"
 {
   rtx op0 = gen_reg_rtx (V4SFmode);
   rtx op1 = gen_reg_rtx (V4BFmode);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5eeb3ab..efe32e5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30995,7 +30995,10 @@
   [(set (match_operand:V4BF 0 "register_operand")
 	  (float_truncate:V4BF
 	    (match_operand:V4SF 1 "nonimmediate_operand")))]
-  "TARGET_SSSE3"
+  "TARGET_SSSE3 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   if (!TARGET_AVXNECONVERT
       && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
@@ -31088,7 +31091,10 @@
   [(set (match_operand:V8BF 0 "register_operand")
 	(float_truncate:V8BF
 	  (match_operand:V8SF 1 "nonimmediate_operand")))]
-  "TARGET_AVX2"
+  "TARGET_AVX2 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   if (!TARGET_AVXNECONVERT
       && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
@@ -31114,7 +31120,9 @@
   [(set (match_operand:V16BF 0 "register_operand")
 	(float_truncate:V16BF
 	  (match_operand:V16SF 1 "nonimmediate_operand")))]
-  "TARGET_AVX512BW && TARGET_EVEX512"
+  "TARGET_AVX512BW && TARGET_EVEX512
+   && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations || TARGET_AVX512BF16)"
 {
   if (!TARGET_AVX512BF16)
     {
@@ -31127,7 +31135,7 @@
   [(set (match_operand:VF1_AVX512BW 0 "register_operand")
 	(float_extend:VF1_AVX512BW
 	  (match_operand:<sf_cvt_bf16> 1 "nonimmediate_operand")))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !HONOR_NANS (BFmode)"
 {
   ix86_expand_vector_bf2sf_with_vec_perm (operands[0], operands[1]);
   DONE;
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
index da31bdb..1b4b62f 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */
+/* { dg-options "-mavx512vl -mavx512bf16 -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */
 
 #include "avx512bw-truncsfbf.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
index 5b59958..e7c65b7 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512bw -mavx512vl -O2" } */
+/* { dg-options "-mavx512bw -mavx512vl -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|vpunpcklwd)} 6 } } */
 
 typedef float v4sf __attribute__((vector_size(16)));
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
index 071db21..40802d8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */
 
 typedef float v4sf __attribute__((vector_size(16)));
diff --git a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
index 0f007df..d7f77ac 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-msse2 -O2" } */
+/* { dg-options "-msse2 -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|punpcklwd)} 2 { target { ! ia32 } } } } */
 
 typedef float v2sf __attribute__((vector_size(8)));
diff --git a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
index 70840c5..af92f4d 100644
--- a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } } */
 
 typedef float v2sf __attribute__((vector_size(8)));
author	liuhongt <hongtao.liu@intel.com>	2024-11-06 18:15:42 -0800
committer	liuhongt <hongtao.liu@intel.com>	2024-11-10 18:20:23 -0800
commit	de867e8da30bf5e0cb51c3946ec43c3c4778d4a0 (patch)
tree	0fdb1dfc26d2e7121de3e339ea2abc63fcfaf45d
parent	ca1cff0c924dfce7d7792dbeab978bbbf65df0fa (diff)
download	gcc-de867e8da30bf5e0cb51c3946ec43c3c4778d4a0.zip gcc-de867e8da30bf5e0cb51c3946ec43c3c4778d4a0.tar.gz gcc-de867e8da30bf5e0cb51c3946ec43c3c4778d4a0.tar.bz2