aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorliuhongt <hongtao.liu@intel.com>2020-12-18 15:56:06 +0800
committerliuhongt <hongtao.liu@intel.com>2021-01-22 12:35:14 +0800
commitee78c20e74d30284fee36e22a64e86e45e676029 (patch)
treef36f196ec99d2fb29394a30cb5d4df8ddd9ff448 /gcc
parentbca467e56fe111fa6d876656c60d5704065e83fe (diff)
downloadgcc-ee78c20e74d30284fee36e22a64e86e45e676029.zip
gcc-ee78c20e74d30284fee36e22a64e86e45e676029.tar.gz
gcc-ee78c20e74d30284fee36e22a64e86e45e676029.tar.bz2
Lower AVX512 vector comparison to AVX version when dest is vector.
gcc/ChangeLog: PR target/96891 PR target/98348 * config/i386/sse.md (VI_128_256): New mode iterator. (*avx_cmp<mode>3_1, *avx_cmp<mode>3_2, *avx_cmp<mode>3_3, *avx_cmp<mode>3_4, *avx2_eq<mode>3, *avx2_pcmp<mode>3_1, *avx2_pcmp<mode>3_2, *avx2_gt<mode>3): New define_insn_and_split to lower avx512 vector comparison to avx version when dest is vector. (*<avx512>_cmp<mode>3,*<avx512>_cmp<mode>3,*<avx512>_ucmp<mode>3): define_insn_and_split for negating the comparison result. * config/i386/predicates.md (float_vector_all_ones_operand): New predicate. * config/i386/i386-expand.c (ix86_expand_sse_movcc): Use general NOT operator without UNSPEC_MASKOP. gcc/testsuite/ChangeLog: PR target/96891 PR target/98348 * gcc.target/i386/avx512bw-pr96891-1.c: New test. * gcc.target/i386/avx512f-pr96891-1.c: New test. * gcc.target/i386/avx512f-pr96891-2.c: New test. * gcc.target/i386/avx512f-pr96891-3.c: New test. * g++.target/i386/avx512f-pr96891-1.C: New test. * gcc.target/i386/bitwise_mask_op-3.c: Adjust testcase.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386-expand.c14
-rw-r--r--gcc/config/i386/predicates.md47
-rw-r--r--gcc/config/i386/sse.md261
-rw-r--r--gcc/testsuite/g++.target/i386/avx512f-pr96891-1.C37
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512bw-pr96891-1.c75
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-pr96891-1.c40
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-pr96891-2.c30
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c39
-rw-r--r--gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c1
9 files changed, 531 insertions, 13 deletions
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 280645f..d64b4ac 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -3568,17 +3568,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
? force_reg (mode, op_false) : op_false);
if (op_true == CONST0_RTX (mode))
{
- rtx (*gen_not) (rtx, rtx);
- switch (cmpmode)
- {
- case E_QImode: gen_not = gen_knotqi; break;
- case E_HImode: gen_not = gen_knothi; break;
- case E_SImode: gen_not = gen_knotsi; break;
- case E_DImode: gen_not = gen_knotdi; break;
- default: gcc_unreachable ();
- }
rtx n = gen_reg_rtx (cmpmode);
- emit_insn (gen_not (n, cmp));
+ if (cmpmode == E_DImode && !TARGET_64BIT)
+ emit_insn (gen_knotdi (n, cmp));
+ else
+ emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));
cmp = n;
/* Reverse op_true op_false. */
std::swap (op_true, op_false);
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 0a3ab4d..ee42ba2 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1069,6 +1069,53 @@
return true;
})
+/* Return true if operand is a float vector constant that is all ones. */
+(define_predicate "float_vector_all_ones_operand"
+ (match_code "const_vector,mem")
+{
+ mode = GET_MODE (op);
+ if (!FLOAT_MODE_P (mode)
+ || (MEM_P (op)
+ && (!SYMBOL_REF_P (XEXP (op, 0))
+ || !CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))))
+ return false;
+
+ if (MEM_P (op))
+ {
+ op = get_pool_constant (XEXP (op, 0));
+ if (GET_CODE (op) != CONST_VECTOR)
+ return false;
+
+ if (GET_MODE (op) != mode
+ && INTEGRAL_MODE_P (GET_MODE (op))
+ && op == CONSTM1_RTX (GET_MODE (op)))
+ return true;
+ }
+
+ rtx first = XVECEXP (op, 0, 0);
+ for (int i = 1; i != GET_MODE_NUNITS (GET_MODE (op)); i++)
+ {
+ rtx tmp = XVECEXP (op, 0, i);
+ if (!rtx_equal_p (tmp, first))
+ return false;
+ }
+ if (GET_MODE (first) == E_SFmode)
+ {
+ long l;
+ REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (first), l);
+ return (l & 0xffffffff) == 0xffffffff;
+ }
+ else if (GET_MODE (first) == E_DFmode)
+ {
+ long l[2];
+ REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (first), l);
+ return ((l[0] & 0xffffffff) == 0xffffffff
+ && (l[1] & 0xffffffff) == 0xffffffff);
+ }
+ else
+ return false;
+})
+
/* Return true if operand is a vector constant that is all ones. */
(define_predicate "vector_all_ones_operand"
(and (match_code "const_vector")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 42d4c44..9683a99 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -648,6 +648,9 @@
;; All 256bit vector integer modes
(define_mode_iterator VI_256 [V32QI V16HI V8SI V4DI])
+;; All 128 and 256bit vector integer modes
+(define_mode_iterator VI_128_256 [V16QI V8HI V4SI V2DI V32QI V16HI V8SI V4DI])
+
;; Various 128bit vector integer mode combinations
(define_mode_iterator VI12_128 [V16QI V8HI])
(define_mode_iterator VI14_128 [V16QI V4SI])
@@ -2965,6 +2968,102 @@
(set_attr "prefix" "vex")
(set_attr "mode" "<MODE>")])
+(define_insn_and_split "*avx_cmp<mode>3_1"
+ [(set (match_operand:<sseintvecmode> 0 "register_operand")
+ (vec_merge:<sseintvecmode>
+ (match_operand:<sseintvecmode> 1 "vector_all_ones_operand")
+ (match_operand:<sseintvecmode> 2 "const0_operand")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VF_128_256 3 "register_operand")
+ (match_operand:VF_128_256 4 "nonimmediate_operand")
+ (match_operand:SI 5 "const_0_to_31_operand")]
+ UNSPEC_PCMP)))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 6)
+ (unspec:VF_128_256
+ [(match_dup 3)
+ (match_dup 4)
+ (match_dup 5)]
+ UNSPEC_PCMP))
+ (set (match_dup 0) (match_dup 7))]
+{
+ operands[6] = gen_reg_rtx (<MODE>mode);
+ operands[7]
+ = lowpart_subreg (GET_MODE (operands[0]), operands[6], <MODE>mode);
+})
+
+(define_insn_and_split "*avx_cmp<mode>3_2"
+ [(set (match_operand:<sseintvecmode> 0 "register_operand")
+ (vec_merge:<sseintvecmode>
+ (match_operand:<sseintvecmode> 1 "vector_all_ones_operand")
+ (match_operand:<sseintvecmode> 2 "const0_operand")
+ (not:<avx512fmaskmode>
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VF_128_256 3 "register_operand")
+ (match_operand:VF_128_256 4 "nonimmediate_operand")
+ (match_operand:SI 5 "const_0_to_31_operand")]
+ UNSPEC_PCMP))))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 6)
+ (unspec:VF_128_256
+ [(match_dup 3)
+ (match_dup 4)
+ (match_dup 5)]
+ UNSPEC_PCMP))
+ (set (match_dup 0) (match_dup 7))]
+{
+ operands[5] = GEN_INT (INTVAL (operands[5]) ^ 4);
+ operands[6] = gen_reg_rtx (<MODE>mode);
+ operands[7]
+ = lowpart_subreg (GET_MODE (operands[0]), operands[6], <MODE>mode);
+})
+
+(define_insn_and_split "*avx_cmp<mode>3_3"
+ [(set (match_operand:VF_128_256 0 "register_operand")
+ (vec_merge:VF_128_256
+ (match_operand:VF_128_256 1 "float_vector_all_ones_operand")
+ (match_operand:VF_128_256 2 "const0_operand")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VF_128_256 3 "register_operand")
+ (match_operand:VF_128_256 4 "nonimmediate_operand")
+ (match_operand:SI 5 "const_0_to_31_operand")]
+ UNSPEC_PCMP)))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:VF_128_256
+ [(match_dup 3)
+ (match_dup 4)
+ (match_dup 5)]
+ UNSPEC_PCMP))])
+
+(define_insn_and_split "*avx_cmp<mode>3_4"
+ [(set (match_operand:VF_128_256 0 "register_operand")
+ (vec_merge:VF_128_256
+ (match_operand:VF_128_256 1 "float_vector_all_ones_operand")
+ (match_operand:VF_128_256 2 "const0_operand")
+ (not:<avx512fmaskmode>
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VF_128_256 3 "register_operand")
+ (match_operand:VF_128_256 4 "nonimmediate_operand")
+ (match_operand:SI 5 "const_0_to_31_operand")]
+ UNSPEC_PCMP))))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:VF_128_256
+ [(match_dup 3)
+ (match_dup 4)
+ (match_dup 5)]
+ UNSPEC_PCMP))]
+ "operands[5] = GEN_INT (INTVAL (operands[5]) ^ 4);")
+
(define_insn "avx_vmcmp<mode>3"
[(set (match_operand:VF_128 0 "register_operand" "=x")
(vec_merge:VF_128
@@ -3056,6 +3155,25 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*<avx512>_cmp<mode>3"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+ (not:<avx512fmaskmode>
+ (unspec:<avx512fmaskmode>
+ [(match_operand:V48_AVX512VL 1 "register_operand")
+ (match_operand:V48_AVX512VL 2 "nonimmediate_operand")
+ (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+ UNSPEC_PCMP)))]
+ "TARGET_AVX512F && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:<avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 4)]
+ UNSPEC_PCMP))]
+ "operands[4] = GEN_INT (INTVAL (operands[3]) ^ 4);")
+
(define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
(unspec:<avx512fmaskmode>
@@ -3070,6 +3188,28 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_int_iterator UNSPEC_PCMP_ITER
+ [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
+
+(define_insn_and_split "*<avx512>_cmp<mode>3"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+ (not:<avx512fmaskmode>
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI12_AVX512VL 1 "register_operand")
+ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
+ (match_operand:SI 3 "<cmp_imm_predicate>")]
+ UNSPEC_PCMP_ITER)))]
+ "TARGET_AVX512BW && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:<avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 4)]
+ UNSPEC_PCMP_ITER))]
+ "operands[4] = GEN_INT (INTVAL (operands[3]) ^ 4);")
+
(define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
(unspec:<avx512fmaskmode>
@@ -3098,8 +3238,24 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_int_iterator UNSPEC_PCMP_ITER
- [UNSPEC_PCMP UNSPEC_UNSIGNED_PCMP])
+(define_insn_and_split "*<avx512>_ucmp<mode>3"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+ (not:<avx512fmaskmode>
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI48_AVX512VL 1 "register_operand")
+ (match_operand:VI48_AVX512VL 2 "nonimmediate_operand")
+ (match_operand:SI 3 "const_0_to_7_operand")]
+ UNSPEC_UNSIGNED_PCMP)))]
+ "TARGET_AVX512F && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:<avx512fmaskmode>
+ [(match_dup 1)
+ (match_dup 2)
+ (match_dup 4)]
+ UNSPEC_UNSIGNED_PCMP))]
+ "operands[4] = GEN_INT (INTVAL (operands[3]) ^ 4);")
(define_int_attr pcmp_signed_mask
[(UNSPEC_PCMP "3") (UNSPEC_UNSIGNED_PCMP "1")])
@@ -12733,6 +12889,89 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
+(define_insn_and_split "*avx2_eq<mode>3"
+ [(set (match_operand:VI_128_256 0 "register_operand")
+ (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "vector_all_ones_operand")
+ (match_operand:VI_128_256 2 "const0_operand")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI_128_256 3 "nonimmediate_operand")
+ (match_operand:VI_128_256 4 "nonimmediate_operand")]
+ UNSPEC_MASKED_EQ)))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()
+ && !(MEM_P (operands[3]) && MEM_P (operands[4]))"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (eq:VI_128_256
+ (match_dup 3)
+ (match_dup 4)))])
+
+(define_insn_and_split "*avx2_pcmp<mode>3_1"
+ [(set (match_operand:VI_128_256 0 "register_operand")
+ (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "vector_all_ones_operand")
+ (match_operand:VI_128_256 2 "const0_operand")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI_128_256 3 "nonimmediate_operand")
+ (match_operand:VI_128_256 4 "nonimmediate_operand")
+ (match_operand:SI 5 "const_0_to_7_operand")]
+ UNSPEC_PCMP)))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()
+ /* EQ is commutative. */
+ && ((INTVAL (operands[5]) == 0
+ && !(MEM_P (operands[3]) && MEM_P (operands[4])))
+ /* NLE aka GT, 3 must be register. */
+ || (INTVAL (operands[5]) == 6
+ && !MEM_P (operands[3]))
+ /* LT, 4 must be register and we swap operands. */
+ || (INTVAL (operands[5]) == 1
+ && !MEM_P (operands[4])))"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ if (INTVAL (operands[5]) == 1)
+ std::swap (operands[3], operands[4]);
+ enum rtx_code code = INTVAL (operands[5]) ? GT : EQ;
+ emit_move_insn (operands[0], gen_rtx_fmt_ee (code, <MODE>mode,
+ operands[3], operands[4]));
+ DONE;
+})
+
+(define_insn_and_split "*avx2_pcmp<mode>3_2"
+ [(set (match_operand:VI_128_256 0 "register_operand")
+ (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "vector_all_ones_operand")
+ (match_operand:VI_128_256 2 "const0_operand")
+ (not:<avx512fmaskmode>
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI_128_256 3 "nonimmediate_operand")
+ (match_operand:VI_128_256 4 "nonimmediate_operand")
+ (match_operand:SI 5 "const_0_to_7_operand")]
+ UNSPEC_PCMP))))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()
+ /* NE is commutative. */
+ && ((INTVAL (operands[5]) == 4
+ && !(MEM_P (operands[3]) && MEM_P (operands[4])))
+ /* LE, 3 must be register. */
+ || (INTVAL (operands[5]) == 2
+ && !MEM_P (operands[3]))
+ /* NLT aka GE, 4 must be register and we swap operands. */
+ || (INTVAL (operands[5]) == 5
+ && !MEM_P (operands[4])))"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ if (INTVAL (operands[5]) == 5)
+ std::swap (operands[3], operands[4]);
+ enum rtx_code code = INTVAL (operands[5]) != 4 ? GT : EQ;
+ emit_move_insn (operands[0], gen_rtx_fmt_ee (code, <MODE>mode,
+ operands[3], operands[4]));
+ DONE;
+})
+
(define_expand "<avx512>_eq<mode>3<mask_scalar_merge_name>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand")
(unspec:<avx512fmaskmode>
@@ -12857,6 +13096,24 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
+(define_insn_and_split "*avx2_gt<mode>3"
+ [(set (match_operand:VI_128_256 0 "register_operand")
+ (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "vector_all_ones_operand")
+ (match_operand:VI_128_256 2 "const0_operand")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI_128_256 3 "register_operand")
+ (match_operand:VI_128_256 4 "nonimmediate_operand")]
+ UNSPEC_MASKED_GT)))]
+ "TARGET_AVX512VL
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (gt:VI_128_256
+ (match_dup 3)
+ (match_dup 4)))])
+
(define_insn "<avx512>_gt<mode>3<mask_scalar_merge_name>"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
(unspec:<avx512fmaskmode>
diff --git a/gcc/testsuite/g++.target/i386/avx512f-pr96891-1.C b/gcc/testsuite/g++.target/i386/avx512f-pr96891-1.C
new file mode 100644
index 0000000..969a085
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/avx512f-pr96891-1.C
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -O2" } */
+/* { dg-final { scan-assembler-not "%k\[0-7\]" } } */
+
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+
+
+v4sf
+foo_v4sf (v4sf x)
+{
+ const union U { unsigned u; float f; } u = { -1U };
+ return x > 0.0f ? u.f : 0.0f;
+}
+
+v8sf
+foo_v8sf (v8sf x)
+{
+ const union U { unsigned u; float f; } u = { -1U };
+ return x > 0.0f ? u.f : 0.0f;
+}
+
+v2df
+foo_v2df (v2df x)
+{
+ const union U { unsigned long long u; double df; } u = { -1ULL };
+ return x > 0.0 ? u.df : 0.0;
+}
+
+v4df
+foo_v4df (v4df x)
+{
+ const union U { unsigned long long u; double df; } u = { -1ULL };
+ return x > 0.0 ? u.df : 0.0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr96891-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr96891-1.c
new file mode 100644
index 0000000..d899ceb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr96891-1.c
@@ -0,0 +1,75 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-not "%k\[0-7\]" } } */
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef short v16hi __attribute__ ((vector_size (32)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef long long v4di __attribute__ ((vector_size (32)));
+
+#define FOO(VTYPE, OPNAME, OP) \
+ VTYPE \
+ foo_##VTYPE##_##OPNAME (VTYPE a, VTYPE b) \
+ { \
+ return a OP b; \
+ } \
+
+#define FOO1(VTYPE, OPNAME, OP) \
+ VTYPE \
+ foo1_##VTYPE##_##OPNAME (VTYPE a, VTYPE b) \
+ { \
+ return ~(a OP b); \
+ } \
+
+FOO (v16qi, eq, ==)
+FOO1 (v16qi, neq, !=)
+FOO (v16qi, gt, >)
+FOO (v16qi, lt, <)
+FOO1 (v16qi, le, <=)
+FOO1 (v16qi, ge, >=)
+FOO (v32qi, eq, ==)
+FOO1 (v32qi, neq, !=)
+FOO (v32qi, gt, >)
+FOO (v32qi, lt, <)
+FOO1 (v32qi, le, <=)
+FOO1 (v32qi, ge, >=)
+FOO (v8hi, eq, ==)
+FOO1 (v8hi, neq, !=)
+FOO (v8hi, gt, >)
+FOO (v8hi, lt, <)
+FOO1 (v8hi, le, <=)
+FOO1 (v8hi, ge, >=)
+FOO (v16hi, eq, ==)
+FOO1 (v16hi, neq, !=)
+FOO (v16hi, gt, >)
+FOO (v16hi, lt, <)
+FOO1 (v16hi, le, <=)
+FOO1 (v16hi, ge, >=)
+FOO (v4si, eq, ==)
+FOO1 (v4si, neq, !=)
+FOO (v4si, gt, >)
+FOO (v4si, lt, <)
+FOO1 (v4si, le, <=)
+FOO1 (v4si, ge, >=)
+FOO (v8si, eq, ==)
+FOO1 (v8si, neq, !=)
+FOO (v8si, gt, >)
+FOO (v8si, lt, <)
+FOO1 (v8si, le, <=)
+FOO1 (v8si, ge, >=)
+FOO (v2di, eq, ==)
+FOO1 (v2di, neq, !=)
+FOO (v2di, gt, >)
+FOO (v2di, lt, <)
+FOO1 (v2di, le, <=)
+FOO1 (v2di, ge, >=)
+FOO (v4di, eq, ==)
+FOO1 (v4di, neq, !=)
+FOO (v4di, gt, >)
+FOO (v4di, lt, >)
+FOO1 (v4di, le, <=)
+FOO1 (v4di, ge, >=)
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr96891-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-1.c
new file mode 100644
index 0000000..48ba943
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -O2" } */
+/* { dg-final { scan-assembler-not "%k\[0-7\]" } } */
+
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+
+#define FOO(VTYPE, OPNAME, OP) \
+ VTYPE \
+ foo_##VTYPE##_##OPNAME (VTYPE a, VTYPE b) \
+ { \
+ return a OP b; \
+ } \
+
+FOO (v4sf, eq, ==)
+FOO (v4sf, neq, !=)
+FOO (v4sf, gt, >)
+FOO (v4sf, ge, >=)
+FOO (v4sf, lt, <)
+FOO (v4sf, le, <=)
+FOO (v8sf, eq, ==)
+FOO (v8sf, neq, !=)
+FOO (v8sf, gt, >)
+FOO (v8sf, ge, >=)
+FOO (v8sf, lt, <)
+FOO (v8sf, le, <=)
+FOO (v2df, eq, ==)
+FOO (v2df, neq, !=)
+FOO (v2df, gt, >)
+FOO (v2df, ge, >=)
+FOO (v2df, lt, <)
+FOO (v2df, le, <=)
+FOO (v4df, eq, ==)
+FOO (v4df, neq, !=)
+FOO (v4df, gt, >)
+FOO (v4df, ge, >=)
+FOO (v4df, lt, <)
+FOO (v4df, le, <=)
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr96891-2.c b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-2.c
new file mode 100644
index 0000000..5192a00
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-2.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bw -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-not "%k\[0-7\]" } } */
+
+#include<immintrin.h>
+
+#define FOO(VTYPE,PREFIX,SUFFIX,OPNAME,MASK,LEN) \
+ VTYPE \
+ foo_##LEN##_##SUFFIX##_##OPNAME (VTYPE a, VTYPE b) \
+ { \
+ MASK m = _mm##PREFIX##_cmp##OPNAME##_##SUFFIX##_mask (a, b); \
+ return _mm##PREFIX##_movm_##SUFFIX (m); \
+ } \
+
+FOO (__m128i,, epi8, eq, __mmask16, 128);
+FOO (__m128i,, epi16, eq, __mmask8, 128);
+FOO (__m128i,, epi32, eq, __mmask8, 128);
+FOO (__m128i,, epi64, eq, __mmask8, 128);
+FOO (__m128i,, epi8, gt, __mmask16, 128);
+FOO (__m128i,, epi16, gt, __mmask8, 128);
+FOO (__m128i,, epi32, gt, __mmask8, 128);
+FOO (__m128i,, epi64, gt, __mmask8, 128);
+FOO (__m256i, 256, epi8, eq, __mmask32, 256);
+FOO (__m256i, 256, epi16, eq, __mmask16, 256);
+FOO (__m256i, 256, epi32, eq, __mmask8, 256);
+FOO (__m256i, 256, epi64, eq, __mmask8, 256);
+FOO (__m256i, 256, epi8, gt, __mmask32, 256);
+FOO (__m256i, 256, epi16, gt, __mmask16, 256);
+FOO (__m256i, 256, epi32, gt, __mmask8, 256);
+FOO (__m256i, 256, epi64, gt, __mmask8, 256);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c
new file mode 100644
index 0000000..1cf18f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bw -mavx512dq -O2 -masm=att" } */
+/* { dg-final { scan-assembler-not {not[bwlqd]\]} } } */
+/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$5} 4} } */
+/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$6} 4} } */
+/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$7} 4} } */
+/* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$5} 2} } */
+/* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$6} 2} } */
+/* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$7} 2} } */
+
+#include<immintrin.h>
+
+#define FOO(VTYPE,PREFIX,SUFFIX,MASK,LEN,CMPIMM) \
+ MASK \
+ foo_##LEN##_##SUFFIX##_##CMPIMM (VTYPE a, VTYPE b) \
+ { \
+ MASK m = _mm##PREFIX##_cmp_##SUFFIX##_mask (a, b, CMPIMM); \
+ return ~m; \
+ } \
+
+FOO (__m128i,, epi8, __mmask16, 128, 1);
+FOO (__m128i,, epi16, __mmask8, 128, 1);
+FOO (__m128i,, epi32, __mmask8, 128, 1);
+FOO (__m128i,, epi64, __mmask8, 128, 1);
+FOO (__m256i, 256, epi8, __mmask32, 256, 2);
+FOO (__m256i, 256, epi16, __mmask16, 256, 2);
+FOO (__m256i, 256, epi32, __mmask8, 256, 2);
+FOO (__m256i, 256, epi64, __mmask8, 256, 2);
+FOO (__m512i, 512, epi8, __mmask64, 512, 3);
+FOO (__m512i, 512, epi16, __mmask32, 512, 3);
+FOO (__m512i, 512, epi32, __mmask16, 512, 3);
+FOO (__m512i, 512, epi64, __mmask8, 512, 3);
+
+FOO (__m128,, ps, __mmask8, 128, 1);
+FOO (__m128d,, pd, __mmask8, 128, 1);
+FOO (__m256, 256, ps, __mmask8, 256, 2);
+FOO (__m256d, 256, pd, __mmask8, 256, 2);
+FOO (__m512, 512, ps, __mmask16, 512, 3);
+FOO (__m512d, 512, pd, __mmask8, 512, 3);
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
index 18bf4f0..4a90786 100644
--- a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
@@ -40,5 +40,4 @@ foo_andnb (__m512i a, __m512i b)
foo = m1 & ~m2;
}
-/* { dg-final { scan-assembler-times "knotb\[\t \]" "1" } } */
/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4"} } */