aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author: liuhongt <hongtao.liu@intel.com> 2021-10-25 15:20:35 +0800
committer: liuhongt <hongtao.liu@intel.com> 2021-10-29 09:45:29 +0800
commit: 84bcefd5555af6d95e08cd980965098961289215 (patch)
tree: df515785707d201789b83ff112559f268df0c449 /gcc
parent: 2322c8b1b4429e85aa1caa6c6bbc46bb41e80178 (diff)
download: gcc-84bcefd5555af6d95e08cd980965098961289215.zip
          gcc-84bcefd5555af6d95e08cd980965098961289215.tar.gz
          gcc-84bcefd5555af6d95e08cd980965098961289215.tar.bz2
Enable vectorization for _Float16 floor/ceil/trunc/nearbyint/rint operations.
gcc/ChangeLog:

	PR target/102464
	* config/i386/i386-builtin-types.def (V8HF_FTYPE_V8HF): New function type.
	(V16HF_FTYPE_V16HF): Ditto.
	(V32HF_FTYPE_V32HF): Ditto.
	(V8HF_FTYPE_V8HF_ROUND): Ditto.
	(V16HF_FTYPE_V16HF_ROUND): Ditto.
	(V32HF_FTYPE_V32HF_ROUND): Ditto.
	* config/i386/i386-builtin.def (IX86_BUILTIN_FLOORPH,
	IX86_BUILTIN_CEILPH, IX86_BUILTIN_TRUNCPH,
	IX86_BUILTIN_FLOORPH256, IX86_BUILTIN_CEILPH256,
	IX86_BUILTIN_TRUNCPH256, IX86_BUILTIN_FLOORPH512,
	IX86_BUILTIN_CEILPH512, IX86_BUILTIN_TRUNCPH512): New builtin.
	* config/i386/i386-builtins.c (ix86_builtin_vectorized_function):
	Enable vectorization for HFmode FLOOR/CEIL/TRUNC operation.
	* config/i386/i386-expand.c (ix86_expand_args_builtin): Handle
	new builtins.
	* config/i386/sse.md (rint<mode>2, nearbyint<mode>2): Extend to
	vector HFmodes.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102464-vrndscaleph.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/config/i386/i386-builtin-types.def | 7
-rw-r--r-- gcc/config/i386/i386-builtin.def | 11
-rw-r--r-- gcc/config/i386/i386-builtins.c | 42
-rw-r--r-- gcc/config/i386/i386-expand.c | 3
-rw-r--r-- gcc/config/i386/sse.md | 12
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c | 115
6 files changed, 184 insertions, 6 deletions
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 4c355c5..e33f06a 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1380,3 +1380,10 @@ DEF_FUNCTION_TYPE (USI, V32HF, V32HF, INT, USI, INT)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, UHI, INT)
DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, V32HF, USI, INT)
DEF_FUNCTION_TYPE (V32HF, V32HF, INT, V32HF, USI, INT)
+
+DEF_FUNCTION_TYPE (V8HF, V8HF)
+DEF_FUNCTION_TYPE (V16HF, V16HF)
+DEF_FUNCTION_TYPE (V32HF, V32HF)
+DEF_FUNCTION_TYPE_ALIAS (V8HF_FTYPE_V8HF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V16HF_FTYPE_V16HF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V32HF_FTYPE_V32HF, ROUND)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 99217d0..d9eee3f 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -958,6 +958,10 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__buil
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_floorph", IX86_BUILTIN_FLOORPH, (enum rtx_code) ROUND_FLOOR, (int) V8HF_FTYPE_V8HF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_ceilph", IX86_BUILTIN_CEILPH, (enum rtx_code) ROUND_CEIL, (int) V8HF_FTYPE_V8HF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512fp16_rndscalev8hf, "__builtin_ia32_truncph", IX86_BUILTIN_TRUNCPH, (enum rtx_code) ROUND_TRUNC, (int) V8HF_FTYPE_V8HF_ROUND)
+
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND)
@@ -1090,6 +1094,10 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia3
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_floorph256", IX86_BUILTIN_FLOORPH256, (enum rtx_code) ROUND_FLOOR, (int) V16HF_FTYPE_V16HF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_ceilph256", IX86_BUILTIN_CEILPH256, (enum rtx_code) ROUND_CEIL, (int) V16HF_FTYPE_V16HF_ROUND)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512vl_rndscalev16hf, "__builtin_ia32_truncph256", IX86_BUILTIN_TRUNCPH256, (enum rtx_code) ROUND_TRUNC, (int) V16HF_FTYPE_V16HF_ROUND)
+
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND)
@@ -1528,6 +1536,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_copysignv8df3, "__builtin_ia32_copy
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sqrtv16sf2, "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) V16SF_FTYPE_V16SF)
BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf, "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, (int) V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_floorps512", IX86_BUILTIN_FLOORPS512, (enum rtx_code) ROUND_FLOOR, (int) V16SF_FTYPE_V16SF_ROUND)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_ceilps512", IX86_BUILTIN_CEILPS512, (enum rtx_code) ROUND_CEIL, (int) V16SF_FTYPE_V16SF_ROUND)
BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_roundps512, "__builtin_ia32_truncps512", IX86_BUILTIN_TRUNCPS512, (enum rtx_code) ROUND_TRUNC, (int) V16SF_FTYPE_V16SF_ROUND)
diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c
index 11ce58b..0fb14b5 100644
--- a/gcc/config/i386/i386-builtins.c
+++ b/gcc/config/i386/i386-builtins.c
@@ -1652,6 +1652,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
else if (out_n == 16 && in_n == 16)
return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
}
+ if (out_mode == HFmode && in_mode == HFmode)
+ {
+ /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
+ under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. */
+ if (out_n < 32 && !TARGET_AVX512VL)
+ break;
+
+ if (out_n == 8 && in_n == 8)
+ return ix86_get_builtin (IX86_BUILTIN_FLOORPH);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_FLOORPH256);
+ else if (out_n == 32 && in_n == 32)
+ return ix86_get_builtin (IX86_BUILTIN_FLOORPH512);
+ }
break;
CASE_CFN_CEIL:
@@ -1677,6 +1691,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
else if (out_n == 16 && in_n == 16)
return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
}
+ if (out_mode == HFmode && in_mode == HFmode)
+ {
+ /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
+ under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. */
+ if (out_n < 32 && !TARGET_AVX512VL)
+ break;
+
+ if (out_n == 8 && in_n == 8)
+ return ix86_get_builtin (IX86_BUILTIN_CEILPH);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_CEILPH256);
+ else if (out_n == 32 && in_n == 32)
+ return ix86_get_builtin (IX86_BUILTIN_CEILPH512);
+ }
break;
CASE_CFN_TRUNC:
@@ -1702,6 +1730,20 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
else if (out_n == 16 && in_n == 16)
return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
}
+ if (out_mode == HFmode && in_mode == HFmode)
+ {
+ /* V8HF/V16HF is supported in ix86_vector_mode_supported_p
+ under TARGET_AVX512FP16, TARGET_AVX512VL is needed here. */
+ if (out_n < 32 && !TARGET_AVX512VL)
+ break;
+
+ if (out_n == 8 && in_n == 8)
+ return ix86_get_builtin (IX86_BUILTIN_TRUNCPH);
+ else if (out_n == 16 && in_n == 16)
+ return ix86_get_builtin (IX86_BUILTIN_TRUNCPH256);
+ else if (out_n == 32 && in_n == 32)
+ return ix86_get_builtin (IX86_BUILTIN_TRUNCPH512);
+ }
break;
CASE_CFN_FMA:
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 4c3800e..fa5cf77 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -9586,6 +9586,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V4SF_FTYPE_V4SF_ROUND:
case V8SF_FTYPE_V8SF_ROUND:
case V16SF_FTYPE_V16SF_ROUND:
+ case V8HF_FTYPE_V8HF_ROUND:
+ case V16HF_FTYPE_V16HF_ROUND:
+ case V32HF_FTYPE_V32HF_ROUND:
case V4SI_FTYPE_V4SF_ROUND:
case V8SI_FTYPE_V8SF_ROUND:
case V16SI_FTYPE_V16SF_ROUND:
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 4685ac9..0a7f5b1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -21936,18 +21936,18 @@
(set_attr "mode" "TI")])
(define_expand "nearbyint<mode>2"
- [(set (match_operand:VF 0 "register_operand")
- (unspec:VF
- [(match_operand:VF 1 "vector_operand")
+ [(set (match_operand:VFH 0 "register_operand")
+ (unspec:VFH
+ [(match_operand:VFH 1 "vector_operand")
(match_dup 2)]
UNSPEC_ROUND))]
"TARGET_SSE4_1"
"operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
(define_expand "rint<mode>2"
- [(set (match_operand:VF 0 "register_operand")
- (unspec:VF
- [(match_operand:VF 1 "vector_operand")
+ [(set (match_operand:VFH 0 "register_operand")
+ (unspec:VFH
+ [(match_operand:VFH 1 "vector_operand")
(match_dup 2)]
UNSPEC_ROUND))]
"TARGET_SSE4_1"
diff --git a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
new file mode 100644
index 0000000..a76d9e7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
@@ -0,0 +1,115 @@
+/* PR target/102464. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512fp16 -mavx512vl -mprefer-vector-width=512" } */
+#include<math.h>
+void
+foo (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 8; i++)
+ a[i] = floor (b[i]);
+}
+
+void
+foo1 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 8; i++)
+ a[i] = ceil (b[i]);
+}
+
+void
+foo2 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 8; i++)
+ a[i] = trunc (b[i]);
+}
+
+void
+foo3 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 8; i++)
+ a[i] = nearbyint (b[i]);
+}
+
+void
+foo4 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 8; i++)
+ a[i] = rint (b[i]);
+}
+
+void
+foo5 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 16; i++)
+ a[i] = floor (b[i]);
+}
+
+void
+foo6 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 16; i++)
+ a[i] = ceil (b[i]);
+}
+
+void
+foo7 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 16; i++)
+ a[i] = trunc (b[i]);
+}
+
+void
+foo8 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 16; i++)
+ a[i] = nearbyint (b[i]);
+}
+
+void
+foo9 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 16; i++)
+ a[i] = rint (b[i]);
+}
+
+void
+foo10 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 32; i++)
+ a[i] = floor (b[i]);
+}
+
+void
+foo11 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 32; i++)
+ a[i] = ceil (b[i]);
+}
+
+void
+foo12 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 32; i++)
+ a[i] = trunc (b[i]);
+}
+
+void
+foo13 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 32; i++)
+ a[i] = nearbyint (b[i]);
+}
+
+void
+foo14 (_Float16* __restrict a, _Float16* b)
+{
+ for (int i = 0; i != 32; i++)
+ a[i] = rint (b[i]);
+}
+
+/* { dg-final { scan-assembler-not "vcvtsh2s\[sd\]" } } */
+/* { dg-final { scan-assembler-not "vcvtph2p\[sd\]" } } */
+/* { dg-final { scan-assembler-not "extendhfxf" } } */
+/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*xmm\[0-9\]" 5 } } */
+/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*ymm\[0-9\]" 5 } } */
+/* { dg-final { scan-assembler-times "vrndscaleph\[^\n\r\]*zmm\[0-9\]" 5 } } */