aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: liuhongt <hongtao.liu@intel.com> 2023-10-09 15:07:54 +0800
committer: liuhongt <hongtao.liu@intel.com> 2023-10-30 11:10:01 +0800
commit: 8c40b72036c967fbb1d1150515cf70aec382f0a2 (patch)
tree: 416aba24f88619b92a2d3ebcd5689e1a7f58a3a8
parent: 8111b5c23bd14f80607bd35af58ec31e38a0378e (diff)
download: gcc-8c40b72036c967fbb1d1150515cf70aec382f0a2.zip
          gcc-8c40b72036c967fbb1d1150515cf70aec382f0a2.tar.gz
          gcc-8c40b72036c967fbb1d1150515cf70aec382f0a2.tar.bz2
Improve memcmpeq for 512-bit vector with vpcmpeq + kortest.

When 2 vectors are equal, kmask is all ones and kortest will set CF,
else CF will be cleared.

So CF bit can be used to check for the result of the comparison.

Before:
        vmovdqu (%rsi), %ymm0
        vpxorq  (%rdi), %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        jne     .L2
        vmovdqu 32(%rsi), %ymm0
        vpxorq  32(%rdi), %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        je      .L5
.L2:
        movl    $1, %eax
        xorl    $1, %eax
        vzeroupper
        ret

After:
        vmovdqu64       (%rsi), %zmm0
        xorl    %eax, %eax
        vpcmpeqd        (%rdi), %zmm0, %k0
        kortestw        %k0, %k0
        setc    %al
        vzeroupper
        ret

gcc/ChangeLog:

        PR target/104610
        * config/i386/i386-expand.cc (ix86_expand_branch): Handle
        512-bit vector with vpcmpeq + kortest.
        * config/i386/i386.md (cbranchxi4): New expander.
        * config/i386/sse.md (cbranch<mode>4): Extend to V16SImode
        and V8DImode.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/pr104610-2.c: New test.

-rw-r--r-- gcc/config/i386/i386-expand.cc 55
-rw-r--r-- gcc/config/i386/i386.md 16
-rw-r--r-- gcc/config/i386/sse.md 36
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr104610-2.c 14
4 files changed, 99 insertions, 22 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 768053c..6ae5830 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2413,30 +2413,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
rtx tmp;
/* Handle special case - vector comparsion with boolean result, transform
- it using ptest instruction. */
+ it using ptest instruction or vpcmpeq + kortest. */
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| (mode == TImode && !TARGET_64BIT)
- || mode == OImode)
+ || mode == OImode
+ || GET_MODE_SIZE (mode) == 64)
{
- rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
- machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+ unsigned msize = GET_MODE_SIZE (mode);
+ machine_mode p_mode
+ = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
+ /* kortest set CF when result is 0xFFFF (op0 == op1). */
+ rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
gcc_assert (code == EQ || code == NE);
- if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+ /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors. */
+ if (msize == 64)
{
- op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
- op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
- mode = p_mode;
+ if (mode != V16SImode)
+ {
+ op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+ op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+ }
+
+ tmp = gen_reg_rtx (HImode);
+ emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
+ emit_insn (gen_kortesthi_ccc (tmp, tmp));
+ }
+ /* Using ptest for 128/256-bit vectors. */
+ else
+ {
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+ {
+ op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
+ op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
+ mode = p_mode;
+ }
+
+ /* Generate XOR since we can't check that one operand is zero
+ vector. */
+ tmp = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
+ tmp = gen_lowpart (p_mode, tmp);
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+ gen_rtx_UNSPEC (CCZmode,
+ gen_rtvec (2, tmp, tmp),
+ UNSPEC_PTEST)));
}
- /* Generate XOR since we can't check that one operand is zero vector. */
- tmp = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
- tmp = gen_lowpart (p_mode, tmp);
- emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
- gen_rtx_UNSPEC (CCZmode,
- gen_rtvec (2, tmp, tmp),
- UNSPEC_PTEST)));
tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, label),
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eb4121b..92fbd57 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1442,6 +1442,22 @@
DONE;
})
+(define_expand "cbranchxi4"
+ [(set (reg:CC FLAGS_REG)
+ (compare:CC (match_operand:XI 1 "nonimmediate_operand")
+ (match_operand:XI 2 "nonimmediate_operand")))
+ (set (pc) (if_then_else
+ (match_operator 0 "bt_comparison_operator"
+ [(reg:CC FLAGS_REG) (const_int 0)])
+ (label_ref (match_operand 3))
+ (pc)))]
+ "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
+{
+ ix86_expand_branch (GET_CODE (operands[0]),
+ operands[1], operands[2], operands[3]);
+ DONE;
+})
+
(define_expand "cstore<mode>4"
[(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e2a7cbe..906212f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2175,9 +2175,9 @@
(set_attr "type" "msklog")
(set_attr "prefix" "vex")])
-(define_insn "kortest<mode>"
- [(set (reg:CC FLAGS_REG)
- (unspec:CC
+(define_insn "*kortest<mode>"
+ [(set (reg FLAGS_REG)
+ (unspec
[(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
(match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
UNSPEC_KORTEST))]
@@ -2187,6 +2187,30 @@
(set_attr "type" "msklog")
(set_attr "prefix" "vex")])
+(define_insn "kortest<mode>_ccc"
+ [(set (reg:CCC FLAGS_REG)
+ (unspec:CCC
+ [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+ (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+ UNSPEC_KORTEST))]
+ "TARGET_AVX512F")
+
+(define_insn "kortest<mode>_ccz"
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ
+ [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+ (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+ UNSPEC_KORTEST))]
+ "TARGET_AVX512F")
+
+(define_expand "kortest<mode>"
+ [(set (reg:CC FLAGS_REG)
+ (unspec:CC
+ [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
+ (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
+ UNSPEC_KORTEST))]
+ "TARGET_AVX512F")
+
(define_insn "kunpckhi"
[(set (match_operand:HI 0 "register_operand" "=k")
(ior:HI
@@ -27825,14 +27849,14 @@
(define_expand "cbranch<mode>4"
[(set (reg:CC FLAGS_REG)
- (compare:CC (match_operand:VI48_AVX 1 "register_operand")
- (match_operand:VI48_AVX 2 "nonimmediate_operand")))
+ (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
+ (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
(set (pc) (if_then_else
(match_operator 0 "bt_comparison_operator"
[(reg:CC FLAGS_REG) (const_int 0)])
(label_ref (match_operand 3))
(pc)))]
- "TARGET_SSE4_1"
+ "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
{
ix86_expand_branch (GET_CODE (operands[0]),
operands[1], operands[2], operands[3]);
diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
new file mode 100644
index 0000000..999ef92
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
+
+int compare (const char* s1, const char* s2)
+{
+ return __builtin_memcmp (s1, s2, 64) == 0;
+}
+
+int compare1 (const char* s1, const char* s2)
+{
+ return __builtin_memcmp (s1, s2, 64) != 0;
+}