aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: liuhongt <hongtao.liu@intel.com> 2024-10-23 00:51:00 -0700
committer: liuhongt <hongtao.liu@intel.com> 2024-11-05 00:20:04 -0800
commit: a17acf4f25f0ce9b8dce24f25867500a3b093b57 (patch)
tree: fa7831c18df8e9eaeb9494e6410cab9d4da3de87
parent: c1bbad07c8686c858ea58ffdb9db8f964bf485c6 (diff)
download: gcc-a17acf4f25f0ce9b8dce24f25867500a3b093b57.zip
gcc-a17acf4f25f0ce9b8dce24f25867500a3b093b57.tar.gz
gcc-a17acf4f25f0ce9b8dce24f25867500a3b093b57.tar.bz2
Support vector float_truncate for SF to BF.
Generate native instruction whenever possible, otherwise use vector
permutation with odd indices.

gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_expand_vector_sf2bf_with_vec_perm): New function.
* config/i386/i386-protos.h
(ix86_expand_vector_sf2bf_with_vec_perm): New declare.
* config/i386/mmx.md (truncv2sfv2bf2): New expander.
* config/i386/sse.md (truncv4sfv4bf2): Ditto.
(truncv8sfv8bf2): Ditto.
(truncv16sfv16bf2): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bf16-truncsfbf.c: New test.
* gcc.target/i386/avx512bw-truncsfbf.c: New test.
* gcc.target/i386/ssse3-truncsfbf.c: New test.
-rw-r--r-- gcc/config/i386/i386-expand.cc 38
-rw-r--r-- gcc/config/i386/i386-protos.h 1
-rw-r--r-- gcc/config/i386/mmx.md 18
-rw-r--r-- gcc/config/i386/sse.md 44
-rw-r--r-- gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c 5
-rw-r--r-- gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c 46
-rw-r--r-- gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c 20
7 files changed, 172 insertions, 0 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ff07ab4..5ee9973 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -26842,4 +26842,42 @@ ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_m
emit_move_insn (output, gen_lowpart (out_mode, d.target));
}
+/* Expand a vector SFmode -> BFmode truncation with a constant vector
+   permutation.  SRC must have mode V4SF, V8SF or V16SF (any other mode
+   hits gcc_unreachable).  SRC is reinterpreted as a BFmode vector with
+   twice as many elements, the odd-indexed elements are gathered into
+   the low half of a temporary, and the low part of that temporary (in
+   DEST's mode) is moved into DEST.
+   NOTE(review): on little-endian x86 the odd-indexed 16-bit elements
+   are presumably the high halves of each float, i.e. this truncates
+   without rounding -- confirm this is the intended semantics.  */
+void
+ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
+{
+ machine_mode vperm_mode, src_mode = GET_MODE (src);
+ switch (src_mode) /* Pick the BFmode vector with twice as many elements as SRC.  */
+ {
+ case V16SFmode:
+ vperm_mode = V32BFmode;
+ break;
+ case V8SFmode:
+ vperm_mode = V16BFmode;
+ break;
+ case V4SFmode:
+ vperm_mode = V8BFmode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ int nelt = GET_MODE_NUNITS (vperm_mode);
+ vec_perm_builder sel (nelt, nelt, 1);
+ sel.quick_grow (nelt);
+ for (int i = 0; i != nelt; i++)
+ sel[i] = (2 * i + 1) % nelt; /* 1, 3, 5, ...; the modulo wraps, so the upper half repeats the same odd indices.  */
+ vec_perm_indices indices (sel, 1, nelt); /* One input vector: op0 is passed for both operands below.  */
+
+ rtx target = gen_reg_rtx (vperm_mode);
+ rtx op0 = lowpart_subreg (vperm_mode,
+ force_reg (src_mode, src),
+ src_mode);
+ bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
+ target, op0, op0, indices);
+ gcc_assert (ok); /* Callers rely on the target hook being able to expand this permutation.  */
+ emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode)); /* Only the low part of the permuted vector is the result.  */
+}
+
+
#include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index c1f9147..55ffdb9 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -258,6 +258,7 @@ extern int ix86_ternlog_idx (rtx op, rtx *args);
extern bool ix86_ternlog_operand_p (rtx op);
extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2,
int idx, rtx target);
+extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx);
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 506f4ca..5c776ec 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2994,6 +2994,24 @@
DONE;
})
+(define_expand "truncv2sfv2bf2"
+ [(set (match_operand:V2BF 0 "register_operand")
+ (float_truncate:V2BF
+ (match_operand:V2SF 1 "nonimmediate_operand")))]
+ "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+{ /* V2SF -> V2BF: view the input as V4SF, reuse truncv4sfv4bf2, keep the low V2BF of its result.  */
+ rtx op1 = gen_reg_rtx (V4SFmode); /* Widened input for the V4SF expander.  */
+ rtx op0 = gen_reg_rtx (V4BFmode); /* Widened result; only its low V2BF part is used.  */
+
+ emit_move_insn (op1, lowpart_subreg (V4SFmode,
+ force_reg (V2SFmode, operands[1]),
+ V2SFmode)); /* NOTE(review): the upper two SF elements of op1 are presumably don't-care -- confirm.  */
+ emit_insn (gen_truncv4sfv4bf2 (op0, op1));
+
+ emit_move_insn (operands[0], lowpart_subreg (V2BFmode, op0, V4BFmode));
+ DONE;
+})
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel integral arithmetic
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 15ed8ff..4859279 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30984,6 +30984,24 @@
"TARGET_AVX512BF16"
"vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
+(define_expand "truncv4sfv4bf2"
+ [(set (match_operand:V4BF 0 "register_operand")
+ (float_truncate:V4BF
+ (match_operand:V4SF 1 "nonimmediate_operand")))]
+ "TARGET_SSSE3"
+{ /* V4SF -> V4BF: use vcvtneps2bf16_v4sf when AVXNECONVERT or AVX512BF16+AVX512VL is enabled, else permute.  */
+ if (!TARGET_AVXNECONVERT
+ && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
+ ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]); /* Permutation fallback.  */
+ else
+ {
+ rtx dest = gen_reg_rtx (V8BFmode); /* The vcvtneps2bf16_v4sf expander's output operand is V8BF.  */
+ emit_insn (gen_vcvtneps2bf16_v4sf (dest, operands[1]));
+ emit_move_insn (operands[0], lowpart_subreg (V4BFmode, dest, V8BFmode)); /* The V4BF result is the low part.  */
+ }
+ DONE;
+})
+
(define_expand "vcvtneps2bf16_v4sf"
[(set (match_operand:V8BF 0 "register_operand")
(vec_concat:V8BF
@@ -31059,6 +31077,20 @@
DONE;
})
+(define_expand "truncv8sfv8bf2"
+ [(set (match_operand:V8BF 0 "register_operand")
+ (float_truncate:V8BF
+ (match_operand:V8SF 1 "nonimmediate_operand")))]
+ "TARGET_AVX2"
+{ /* V8SF -> V8BF: only the permutation fallback is expanded by hand here.  */
+ if (!TARGET_AVXNECONVERT
+ && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
+ {
+ ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
+ DONE;
+ } /* Otherwise fall through so the float_truncate template above is emitted (NOTE(review): presumably matched by vcvtneps2bf16_v8sf -- confirm).  */
+})
+
(define_insn "vcvtneps2bf16_v8sf"
[(set (match_operand:V8BF 0 "register_operand" "=x,v")
(float_truncate:V8BF
@@ -31071,6 +31103,18 @@
(set_attr "addr" "gpr16,*")
(set_attr "prefix" "vex,evex")])
+(define_expand "truncv16sfv16bf2"
+ [(set (match_operand:V16BF 0 "register_operand")
+ (float_truncate:V16BF
+ (match_operand:V16SF 1 "nonimmediate_operand")))]
+ "TARGET_AVX512BW && TARGET_EVEX512"
+{ /* V16SF -> V16BF: only the permutation fallback is expanded by hand here.  */
+ if (!TARGET_AVX512BF16)
+ {
+ ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
+ DONE;
+ } /* Otherwise fall through so the float_truncate template above is emitted (NOTE(review): presumably matched by an AVX512BF16 insn pattern -- confirm).  */
+})
(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
[(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
new file mode 100644
index 0000000..da31bdb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */
+
+#include "avx512bw-truncsfbf.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
new file mode 100644
index 0000000..071db21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */
+
+typedef float v4sf __attribute__((vector_size(16))); /* 16-byte float vector.  */
+typedef float v8sf __attribute__((vector_size(32))); /* 32-byte float vector.  */
+typedef float v16sf __attribute__((vector_size(64))); /* 64-byte float vector.  */
+typedef __bf16 v4bf __attribute__((vector_size(8))); /* 8-byte __bf16 vector.  */
+typedef __bf16 v8bf __attribute__((vector_size(16))); /* 16-byte __bf16 vector.  */
+typedef __bf16 v16bf __attribute__((vector_size(32))); /* 32-byte __bf16 vector.  */
+
+v4bf
+foo (v4sf b, v4sf a) /* Converts the by-value argument A; B is unused.  */
+{
+ return __builtin_convertvector (a, v4bf);
+}
+
+v8bf
+foo2 (v8sf b, v8sf a) /* Converts the by-value argument A; B is unused.  */
+{
+ return __builtin_convertvector (a, v8bf);
+}
+
+v16bf
+foo3 (v16sf b, v16sf a) /* Converts the by-value argument A; B is unused.  */
+{
+ return __builtin_convertvector (a, v16bf);
+}
+
+v4bf
+foo_mem (v4sf* a) /* Converts the vector loaded through pointer A.  */
+{
+ return __builtin_convertvector (*a, v4bf);
+}
+
+v8bf
+foo2_mem (v8sf* a) /* Converts the vector loaded through pointer A.  */
+{
+ return __builtin_convertvector (*a, v8bf);
+}
+
+v16bf
+foo3_mem (v16sf* a) /* Converts the vector loaded through pointer A.  */
+{
+ return __builtin_convertvector (*a, v16bf);
+}
diff --git a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
new file mode 100644
index 0000000..70840c5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } } */
+
+typedef float v2sf __attribute__((vector_size(8))); /* 8-byte float vector.  */
+typedef __bf16 v2bf __attribute__((vector_size(4))); /* 4-byte __bf16 vector.  */
+
+v2bf
+foo (v2sf b, v2sf a) /* Converts the by-value argument A; B is unused.  */
+{
+ return __builtin_convertvector (a, v2bf);
+}
+
+
+v2bf
+foo_mem (v2sf* a) /* Converts the vector loaded through pointer A.  */
+{
+ return __builtin_convertvector (*a, v2bf);
+}
+