aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorliuhongt <hongtao.liu@intel.com>2020-06-03 17:25:47 +0800
committerliuhongt <hongtao.liu@intel.com>2020-06-15 09:43:15 +0800
commit54cdb2f5a5b01a482d7cbce30e7b738558eecf59 (patch)
treeccae80dbe6df3f68a0d5eba65c76b43cb8a2ad88 /gcc
parent0dcb572c08a7b4596f5481cb4491d755a63578ab (diff)
downloadgcc-54cdb2f5a5b01a482d7cbce30e7b738558eecf59.zip
gcc-54cdb2f5a5b01a482d7cbce30e7b738558eecf59.tar.gz
gcc-54cdb2f5a5b01a482d7cbce30e7b738558eecf59.tar.bz2
Optimize multiplication for V8QI,V16QI,V32QI under TARGET_AVX512BW.
2020-06-13 Hongtao Liu <hongtao.liu@intel.com> gcc/ChangeLog: PR target/95488 * config/i386/i386-expand.c (ix86_expand_vecmul_qihi): New function. * config/i386/i386-protos.h (ix86_expand_vecmul_qihi): Declare. * config/i386/sse.md (mul<mode>3): Drop mask_name since there's no real simd int8 multiplication instruction with mask. Also optimize it under TARGET_AVX512BW. (mulv8qi3): New expander. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512bw-pr95488-1.c: New test. * gcc.target/i386/avx512bw-pr95488-2.c: Ditto. * gcc.target/i386/avx512vl-pr95488-1.c: Ditto. * gcc.target/i386/avx512vl-pr95488-2.c: Ditto.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386-expand.c65
-rw-r--r--gcc/config/i386/i386-protos.h1
-rw-r--r--gcc/config/i386/sse.md16
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c21
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c47
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c36
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c50
7 files changed, 234 insertions, 2 deletions
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 270585d..3a414f6 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -19466,6 +19466,71 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
gcc_assert (ok);
}
+/* Optimize vector MUL generation for V8QI, V16QI and V32QI
+ under TARGET_AVX512BW, e.g. for v16qi a * b, emit
+
+ vpmovzxbw ymm2, xmm0
+ vpmovzxbw ymm3, xmm1
+ vpmullw ymm4, ymm2, ymm3
+ vpmovwb xmm0, ymm4
+
+ which takes fewer instructions than ix86_expand_vecop_qihi.
+
+ DEST receives the product of OP1 and OP2; the mode of DEST selects
+ which QImode vector variant is expanded. Return true if the sequence
+ was emitted, false (with nothing emitted) if the mode or the enabled
+ target features are not supported. */
+
+bool
+ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+{
+ machine_mode himode, qimode = GET_MODE (dest);
+ rtx hop1, hop2, hdest;
+ rtx (*gen_extend)(rtx, rtx);
+ rtx (*gen_truncate)(rtx, rtx);
+
+ /* V64QI would need a V64HImode multiplication, and there is no such
+ instruction. */
+ if (qimode == E_V64QImode)
+ return false;
+
+ /* vpmovwb is only available under AVX512BW; for V8QI and V16QI
+ AVX512VL is required as well. */
+ if (!TARGET_AVX512BW)
+ return false;
+ if ((qimode == V8QImode || qimode == V16QImode)
+ && !TARGET_AVX512VL)
+ return false;
+ /* Do not generate zmm instructions when a 128-bit or 256-bit vector
+ width is preferred. */
+ if (qimode == V32QImode
+ && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
+ return false;
+
+ switch (qimode)
+ {
+ case E_V8QImode:
+ himode = V8HImode;
+ gen_extend = gen_zero_extendv8qiv8hi2;
+ gen_truncate = gen_truncv8hiv8qi2;
+ break;
+ case E_V16QImode:
+ himode = V16HImode;
+ gen_extend = gen_zero_extendv16qiv16hi2;
+ gen_truncate = gen_truncv16hiv16qi2;
+ break;
+ case E_V32QImode:
+ himode = V32HImode;
+ gen_extend = gen_zero_extendv32qiv32hi2;
+ gen_truncate = gen_truncv32hiv32qi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ hop1 = gen_reg_rtx (himode);
+ hop2 = gen_reg_rtx (himode);
+ hdest = gen_reg_rtx (himode);
+ emit_insn (gen_extend (hop1, op1));
+ emit_insn (gen_extend (hop2, op2));
+ emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+ hop1, hop2)));
+ emit_insn (gen_truncate (dest, hdest));
+ return true;
+}
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e557449..f532049 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -204,6 +204,7 @@ extern void ix86_expand_round (rtx, rtx);
extern void ix86_expand_rounddf_32 (rtx, rtx);
extern void ix86_expand_round_sse4 (rtx, rtx);
+extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
extern rtx ix86_split_stack_guard (void);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7815d77..aa9fdc8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -11658,12 +11658,24 @@
(set_attr "prefix" "orig,maybe_evex")
(set_attr "mode" "TI")])
-(define_expand "mul<mode>3<mask_name>"
+;; V8QI multiplication, expanded through ix86_expand_vecmul_qihi.
+;; Only enabled when both AVX512VL and AVX512BW are available.
+(define_expand "mulv8qi3"
+ [(set (match_operand:V8QI 0 "register_operand")
+ (mult:V8QI (match_operand:V8QI 1 "register_operand")
+ (match_operand:V8QI 2 "register_operand")))]
+ "TARGET_AVX512VL && TARGET_AVX512BW"
+{
+ /* Call the expander outside of gcc_assert: the assertion macro is not
+ guaranteed to evaluate its argument in every build configuration, and
+ the call is what actually emits the instructions. */
+ bool ok = ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]);
+ gcc_assert (ok);
+ DONE;
+})
+
+(define_expand "mul<mode>3"
[(set (match_operand:VI1_AVX512 0 "register_operand")
(mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
(match_operand:VI1_AVX512 2 "register_operand")))]
- "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
+ "TARGET_SSE2"
{
+ /* Try the widen/multiply/truncate sequence first; when it reports that
+ the mode or target is not supported, use the generic QImode expansion. */
+ if (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]))
+ DONE;
ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
DONE;
})
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
new file mode 100644
index 0000000..594e511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
@@ -0,0 +1,21 @@
+/* PR target/95488 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*zmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 2 } } */
+
+/* 32-byte vectors of signed and unsigned 8-bit elements. The dg-final
+ directives of this test expect each multiplication to use a zmm vpmullw,
+ which presumably explains the "_512" suffix of the function names. */
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi __attribute__ ((vector_size (32)));
+
+/* Element-wise product of two v32qi vectors. */
+__attribute__((noipa)) v32qi
+mul_512 (v32qi a, v32qi b)
+{
+ return a * b;
+}
+
+/* Element-wise product of two v32uqi vectors. */
+__attribute__((noipa)) v32uqi
+umul_512 (v32uqi a, v32uqi b)
+{
+ return a * b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c
new file mode 100644
index 0000000..de31966
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512bw-pr95488-1.c"
+
+/* Fill two N-element typeS arrays with i * i and i + 20, multiply them as
+ typeV vectors with FN, and compare every lane of the result against the
+ product computed element by element in scalar code; abort on the first
+ mismatch.
+ NOTE(review): the vector loads cast a plain typeS array pointer to
+ typeV *, which assumes the arrays are suitably aligned for typeV --
+ confirm. */
+#define TEST_MULB(typeV, typeS, N, fn) \
+do \
+ { \
+ typeV v1, v2, res; \
+ typeS s1[N], s2[N], exp[N]; \
+ int i,j; \
+ \
+ for (i = 0; i < N; i++) \
+ { \
+ s1[i] = i * i; \
+ s2[i] = i + 20; \
+ } \
+ for (i = 0; i < N; i++) \
+ exp[i] = s1[i] * s2[i]; \
+ v1 = *(typeV *)&s1[0]; \
+ v2 = *(typeV *)&s2[0]; \
+ res = fn (v1, v2); \
+ for (j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort(); \
+ } \
+} \
+while (0)
+
+/* Test body (named avx512bw_test unless TEST is predefined); presumably
+ invoked by the CHECK helper header. Checks the signed and the unsigned
+ 32-element byte multiplication. */
+static void
+TEST (void)
+{
+ TEST_MULB (v32qi, char, 32, mul_512);
+ TEST_MULB (v32uqi, unsigned char, 32, umul_512);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
new file mode 100644
index 0000000..b3674fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
@@ -0,0 +1,36 @@
+/* PR target/95488 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 4 } } */
+
+/* 16-byte and 8-byte vectors of signed and unsigned 8-bit elements. The
+ dg-final directives of this test expect xmm and ymm vpmullw instructions,
+ which presumably explains the "_128" and "_256" suffixes of the function
+ names. */
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef unsigned char v16uqi __attribute__ ((vector_size (16)));
+typedef unsigned char v8uqi __attribute__ ((vector_size (8)));
+
+/* Element-wise product of two v8qi vectors. */
+__attribute__((noipa)) v8qi
+mul_128 (v8qi a, v8qi b)
+{
+ return a * b;
+}
+
+/* Element-wise product of two v16qi vectors. */
+__attribute__((noipa)) v16qi
+mul_256 (v16qi a, v16qi b)
+{
+ return a * b;
+}
+
+/* Element-wise product of two v8uqi vectors. */
+__attribute__((noipa)) v8uqi
+umul_128 (v8uqi a, v8uqi b)
+{
+ return a * b;
+}
+
+/* Element-wise product of two v16uqi vectors. */
+__attribute__((noipa)) v16uqi
+umul_256 (v16uqi a, v16uqi b)
+{
+ return a * b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c
new file mode 100644
index 0000000..45d7437
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c
@@ -0,0 +1,50 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512vl-pr95488-1.c"
+
+/* Fill two N-element typeS arrays with i * i and i + 20, multiply them as
+ typeV vectors with FN, and compare every lane of the result against the
+ product computed element by element in scalar code; abort on the first
+ mismatch.
+ NOTE(review): the vector loads cast a plain typeS array pointer to
+ typeV *, which assumes the arrays are suitably aligned for typeV --
+ confirm. */
+#define TEST_MULB(typeV, typeS, N, fn) \
+do \
+ { \
+ typeV v1, v2, res; \
+ int i,j; \
+ typeS s1[N], s2[N], exp[N]; \
+ \
+ for (i = 0; i < N; i++) \
+ { \
+ s1[i] = i * i; \
+ s2[i] = i + 20; \
+ } \
+ for (i = 0; i < N; i++) \
+ exp[i] = s1[i] * s2[i]; \
+ v1 = *(typeV *)s1; \
+ v2 = *(typeV *)s2; \
+ res = fn (v1, v2); \
+ for (j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort(); \
+ } \
+ } \
+while (0)
+
+/* Test body (named avx512bw_test unless TEST is predefined); presumably
+ invoked by the CHECK helper header. Checks the signed and unsigned
+ byte multiplications for 8-element and 16-element vectors. */
+static void
+TEST (void)
+{
+ TEST_MULB(v8qi, char, 8, mul_128);
+ TEST_MULB(v8uqi, unsigned char, 8, umul_128);
+ TEST_MULB(v16qi, char, 16, mul_256);
+ TEST_MULB(v16uqi, unsigned char, 16, umul_256);
+}