aboutsummaryrefslogtreecommitdiff
path: root/gcc/config/i386
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config/i386')
-rw-r--r--gcc/config/i386/driver-i386.cc24
-rw-r--r--gcc/config/i386/i386-c.cc7
-rw-r--r--gcc/config/i386/i386-expand.cc6
-rw-r--r--gcc/config/i386/i386-features.cc141
-rw-r--r--gcc/config/i386/i386-jit.cc12
-rw-r--r--gcc/config/i386/i386-options.cc4
-rw-r--r--gcc/config/i386/i386.h2
-rw-r--r--gcc/config/i386/sse.md257
8 files changed, 434 insertions, 19 deletions
diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
index fe71f55..0557df9 100644
--- a/gcc/config/i386/driver-i386.cc
+++ b/gcc/config/i386/driver-i386.cc
@@ -553,6 +553,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
processor = PROCESSOR_PENTIUM;
break;
case 6:
+ case 18:
case 19:
processor = PROCESSOR_PENTIUMPRO;
break;
@@ -639,18 +640,27 @@ const char *host_detect_local_cpu (int argc, const char **argv)
}
else if (has_feature (FEATURE_AVX))
{
- /* Assume Panther Lake. */
- if (has_feature (FEATURE_PREFETCHI))
- cpu = "pantherlake";
/* Assume Clearwater Forest. */
- else if (has_feature (FEATURE_USER_MSR))
+ if (has_feature (FEATURE_USER_MSR))
cpu = "clearwaterforest";
- /* Assume Arrow Lake S. */
+ /* Assume Nova Lake. */
+ else if (has_feature (FEATURE_PREFETCHI))
+ cpu = "novalake";
else if (has_feature (FEATURE_SM3))
- cpu = "arrowlake-s";
+ {
+ if (has_feature (FEATURE_KL))
+ /* Assume Arrow Lake S. */
+ cpu = "arrowlake-s";
+ else
+ /* Assume Panther Lake. */
+ cpu = "pantherlake";
+ }
/* Assume Sierra Forest. */
- else if (has_feature (FEATURE_AVXVNNIINT8))
+ else if (has_feature (FEATURE_CLDEMOTE))
cpu = "sierraforest";
+ /* Assume Arrow Lake. */
+ else if (has_feature (FEATURE_AVXVNNIINT8))
+ cpu = "arrowlake";
/* Assume Alder Lake. */
else if (has_feature (FEATURE_SERIALIZE))
cpu = "alderlake";
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 0037465..2d92cee 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -295,6 +295,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__diamondrapids");
def_or_undef (parse_in, "__diamondrapids__");
break;
+ case PROCESSOR_NOVALAKE:
+ def_or_undef (parse_in, "__novalake");
+ def_or_undef (parse_in, "__novalake__");
+ break;
/* use PROCESSOR_max to not set/unset the arch macro. */
case PROCESSOR_max:
@@ -498,6 +502,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_DIAMONDRAPIDS:
def_or_undef (parse_in, "__tune_diamondrapids__");
break;
+ case PROCESSOR_NOVALAKE:
+ def_or_undef (parse_in, "__tune_novalake__");
+ break;
case PROCESSOR_INTEL:
case PROCESSOR_GENERIC:
break;
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5bcc35c..a1f1b26 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -9515,9 +9515,9 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
machine_mode move_mode = VOIDmode;
int unroll_factor = 1;
/* TODO: Once value ranges are available, fill in proper data. */
- unsigned HOST_WIDE_INT min_size = 0;
- unsigned HOST_WIDE_INT max_size = -1;
- unsigned HOST_WIDE_INT probable_max_size = -1;
+ unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
+ unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U;
+ unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
bool misaligned_prologue_used = false;
addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 9348f55..8e27784 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -449,6 +449,30 @@ scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
return true;
}
+/* Check whether X is a convertible *concatditi_? variant. X is known
+ to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI.
+
+ Return true only when one operand of X is
+ (ashift (zero_extend (reg:DI)) (const_int 64))
+ and the other operand is
+ (zero_extend (reg:DI)).
+ The two operands are accepted in either order. Both inner operands
+ must be DImode registers. The code of X itself is not inspected
+ here. */
+
+static bool
+timode_concatdi_p (rtx x)
+{
+ rtx op0 = XEXP (x, 0);
+ rtx op1 = XEXP (x, 1);
+
+ if (GET_CODE (op1) == ASHIFT) /* Canonicalize: shifted operand goes in op0. */
+ std::swap (op0, op1);
+
+ return GET_CODE (op0) == ASHIFT
+ && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
+ && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
+ && REG_P (XEXP (XEXP (op0, 0), 0))
+ && CONST_INT_P (XEXP (op0, 1))
+ && INTVAL (XEXP (op0, 1)) == 64
+ && GET_CODE (op1) == ZERO_EXTEND
+ && GET_MODE (XEXP (op1, 0)) == DImode
+ && REG_P (XEXP (op1, 0));
+}
+
+
/* Add instruction into a chain. Return true if OK, false if the search
was aborted. */
@@ -477,9 +501,26 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
if (!analyze_register_chain (candidates, ref, disallowed))
return false;
- /* The operand(s) of VEC_SELECT don't need to be converted/convertible. */
- if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
- return true;
+ /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
+ to be converted/convertible. */
+ if (def_set)
+ switch (GET_CODE (SET_SRC (def_set)))
+ {
+ case VEC_SELECT:
+ return true;
+ case ZERO_EXTEND:
+ if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
+ return true;
+ break;
+ case PLUS:
+ case IOR:
+ case XOR:
+ if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
+ return true;
+ break;
+ default:
+ break;
+ }
for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
if (!DF_REF_REG_MEM_P (ref))
@@ -1628,14 +1669,34 @@ timode_scalar_chain::compute_convert_gain ()
break;
case AND:
+ if (!MEM_P (dst))
+ igain = COSTS_N_INSNS (1);
+ if (CONST_SCALAR_INT_P (XEXP (src, 1)))
+ igain += timode_immed_const_gain (XEXP (src, 1), bb);
+ break;
+
case XOR:
case IOR:
+ if (timode_concatdi_p (src))
+ {
+ /* vmovq;vpinsrq (11 bytes). */
+ igain = speed_p ? -2 * ix86_cost->sse_to_integer
+ : -COSTS_N_BYTES (11);
+ break;
+ }
if (!MEM_P (dst))
igain = COSTS_N_INSNS (1);
if (CONST_SCALAR_INT_P (XEXP (src, 1)))
igain += timode_immed_const_gain (XEXP (src, 1), bb);
break;
+ case PLUS:
+ if (timode_concatdi_p (src))
+ /* vmovq;vpinsrq (11 bytes). */
+ igain = speed_p ? -2 * ix86_cost->sse_to_integer
+ : -COSTS_N_BYTES (11);
+ break;
+
case ASHIFT:
case LSHIFTRT:
/* See ix86_expand_v1ti_shift. */
@@ -1794,6 +1855,13 @@ timode_scalar_chain::compute_convert_gain ()
igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
break;
+ case ZERO_EXTEND:
+ if (GET_MODE (XEXP (src, 0)) == DImode)
+ /* xor (2 bytes) vs. vmovq (5 bytes). */
+ igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
+ : -COSTS_N_BYTES (3);
+ break;
+
default:
break;
}
@@ -1858,6 +1926,28 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg)
}
}
+/* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
+ Insert this before INSN, and return the result as a V1TImode subreg.
+
+ SRC is assumed to have the shape accepted by timode_concatdi_p; that
+ shape is not re-checked here. HI is the operand found two levels
+ below whichever side of SRC is an ASHIFT, and LO is the operand found
+ one level below the other side. A fresh V2DImode pseudo receives
+ gen_vec_concatv2di (tmp, lo, hi), and the value returned is a SUBREG
+ of that pseudo at byte offset 0. */
+
+static rtx
+timode_convert_concatdi (rtx src, rtx_insn *insn)
+{
+ rtx hi, lo;
+ rtx tmp = gen_reg_rtx (V2DImode);
+ if (GET_CODE (XEXP (src, 0)) == ASHIFT)
+ {
+ hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
+ lo = XEXP (XEXP (src, 1), 0);
+ }
+ else
+ {
+ hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
+ lo = XEXP (XEXP (src, 0), 0);
+ }
+ emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn);
+ return gen_rtx_SUBREG (V1TImode, tmp, 0);
+}
+
/* Convert INSN from TImode to V1TImode. */
void
@@ -1967,10 +2057,24 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
PUT_MODE (src, V1TImode);
break;
}
- /* FALLTHRU */
+ convert_op (&XEXP (src, 0), insn);
+ convert_op (&XEXP (src, 1), insn);
+ PUT_MODE (src, V1TImode);
+ if (MEM_P (dst))
+ {
+ tmp = gen_reg_rtx (V1TImode);
+ emit_insn_before (gen_rtx_SET (tmp, src), insn);
+ src = tmp;
+ }
+ break;
case XOR:
case IOR:
+ if (timode_concatdi_p (src))
+ {
+ src = timode_convert_concatdi (src, insn);
+ break;
+ }
convert_op (&XEXP (src, 0), insn);
convert_op (&XEXP (src, 1), insn);
PUT_MODE (src, V1TImode);
@@ -2010,6 +2114,26 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
PUT_MODE (src, V1TImode);
break;
+ case ZERO_EXTEND:
+ if (GET_MODE (XEXP (src, 0)) == DImode)
+ {
+ /* Convert to *vec_concatv2di_0. */
+ rtx tmp = gen_reg_rtx (V2DImode);
+ rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
+ emit_insn_before (gen_move_insn (tmp, pat), insn);
+ src = gen_rtx_SUBREG (vmode, tmp, 0);
+ }
+ else
+ gcc_unreachable ();
+ break;
+
+ case PLUS:
+ if (timode_concatdi_p (src))
+ src = timode_convert_concatdi (src, insn);
+ else
+ gcc_unreachable ();
+ break;
+
default:
gcc_unreachable ();
}
@@ -2389,6 +2513,8 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
case IOR:
case XOR:
+ if (timode_concatdi_p (src))
+ return true;
return (REG_P (XEXP (src, 0))
|| timode_mem_p (XEXP (src, 0)))
&& (REG_P (XEXP (src, 1))
@@ -2408,6 +2534,13 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
&& CONST_INT_P (XEXP (src, 1))
&& (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
+ case PLUS:
+ return timode_concatdi_p (src);
+
+ case ZERO_EXTEND:
+ return REG_P (XEXP (src, 0))
+ && GET_MODE (XEXP (src, 0)) == DImode;
+
default:
return false;
}
diff --git a/gcc/config/i386/i386-jit.cc b/gcc/config/i386/i386-jit.cc
index c1e2929..73ca590 100644
--- a/gcc/config/i386/i386-jit.cc
+++ b/gcc/config/i386/i386-jit.cc
@@ -65,6 +65,18 @@ ix86_jit_register_target_info (void)
jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_INT128_T);
}
+ if (float16_type_node != NULL && TYPE_PRECISION (float16_type_node) == 16)
+ jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_FLOAT16);
+
+ if (float32_type_node != NULL && TYPE_PRECISION (float32_type_node) == 32)
+ jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_FLOAT32);
+
+ if (float64_type_node != NULL && TYPE_PRECISION (float64_type_node) == 64)
+ jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_FLOAT64);
+
+ if (float128_type_node != NULL && TYPE_PRECISION (float128_type_node) == 128)
+ jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_FLOAT128);
+
#define ADD_TARGET_INFO jit_add_target_info
#include "i386-rust-and-jit.inc"
#undef ADD_TARGET_INFO
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 35cba3f..dadcf76 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -132,6 +132,7 @@ along with GCC; see the file COPYING3. If not see
#define m_ARROWLAKE_S (HOST_WIDE_INT_1U<<PROCESSOR_ARROWLAKE_S)
#define m_PANTHERLAKE (HOST_WIDE_INT_1U<<PROCESSOR_PANTHERLAKE)
#define m_DIAMONDRAPIDS (HOST_WIDE_INT_1U<<PROCESSOR_DIAMONDRAPIDS)
+#define m_NOVALAKE (HOST_WIDE_INT_1U<<PROCESSOR_NOVALAKE)
#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
| m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
| m_TIGERLAKE | m_COOPERLAKE | m_SAPPHIRERAPIDS \
@@ -140,7 +141,7 @@ along with GCC; see the file COPYING3. If not see
#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2)
#define m_CORE_HYBRID (m_ALDERLAKE | m_ARROWLAKE | m_ARROWLAKE_S \
- | m_PANTHERLAKE)
+ | m_PANTHERLAKE | m_NOVALAKE)
#define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT)
#define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
#define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
@@ -790,6 +791,7 @@ static const struct processor_costs *processor_cost_table[] =
&alderlake_cost, /* PROCESSOR_ARROWLAKE_S. */
&alderlake_cost, /* PROCESSOR_PANTHERLAKE. */
&icelake_cost, /* PROCESSOR_DIAMONDRAPIDS. */
+ &alderlake_cost, /* PROCESSOR_NOVALAKE. */
&alderlake_cost, /* PROCESSOR_INTEL. */
&lujiazui_cost, /* PROCESSOR_LUJIAZUI. */
&yongfeng_cost, /* PROCESSOR_YONGFENG. */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3a66d78..94f335f 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2356,6 +2356,7 @@ enum processor_type
PROCESSOR_ARROWLAKE_S,
PROCESSOR_PANTHERLAKE,
PROCESSOR_DIAMONDRAPIDS,
+ PROCESSOR_NOVALAKE,
PROCESSOR_INTEL,
PROCESSOR_LUJIAZUI,
PROCESSOR_YONGFENG,
@@ -2487,6 +2488,7 @@ constexpr wide_int_bitmask PTA_DIAMONDRAPIDS = PTA_GRANITERAPIDS_D
| PTA_CMPCCXADD | PTA_SHA512 | PTA_SM3 | PTA_SM4 | PTA_AVX10_2
| PTA_APX_F | PTA_AMX_AVX512 | PTA_AMX_FP8 | PTA_AMX_TF32 | PTA_MOVRS
| PTA_AMX_MOVRS;
+constexpr wide_int_bitmask PTA_NOVALAKE = PTA_PANTHERLAKE | PTA_PREFETCHI;
constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8b28c8e..5eba992 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -569,6 +569,18 @@
(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
+(define_mode_iterator VI_AVX
+ [(V32QI "TARGET_AVX") V16QI
+ (V16HI "TARGET_AVX") V8HI
+ (V8SI "TARGET_AVX") V4SI
+ (V4DI "TARGET_AVX") V2DI])
+
+(define_mode_iterator VI_AVX2_CMP
+ [(V32QI "TARGET_AVX2") V16QI
+ (V16HI "TARGET_AVX2") V8HI
+ (V8SI "TARGET_AVX2") V4SI
+ (V4DI "TARGET_AVX2") V2DI])
+
(define_mode_iterator VI_AVX_AVX512F
[(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
@@ -896,7 +908,8 @@
(define_mode_attr ssebytemode
[(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")
(V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")
- (V8HI "V16QI")])
+ (V16HI "V32QI") (V8HI "V16QI")
+ (V32QI "V32QI") (V16QI "V16QI")])
(define_mode_attr sseintconvert
[(V32HI "w") (V16HI "w") (V8HI "w")
@@ -4013,6 +4026,170 @@
DONE;
})
+(define_expand "reduc_sbool_and_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+ (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand")]
+ "TARGET_AVX512F"
+{
+ int n_elt = INTVAL (operands[2]);
+ rtx op2 = CONSTM1_RTX (<MODE>mode);
+ rtx op1 = operands[1];
+ if (n_elt < 8)
+ {
+ op2 = gen_int_mode ((1u << n_elt) - 1, QImode);
+ op1 = gen_reg_rtx (QImode);
+ emit_insn (gen_andqi3 (op1, operands[1], op2));
+ }
+ ix86_expand_setcc (operands[0], EQ, op1, op2);
+ DONE;
+})
+
+(define_expand "reduc_sbool_ior_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+ (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand")]
+ "TARGET_AVX512F"
+{
+ int n_elt = INTVAL (operands[2]);
+ rtx op1 = operands[1];
+ if (n_elt < 8)
+ {
+ rtx op2 = gen_int_mode ((1u << n_elt) - 1, QImode);
+ op1 = gen_reg_rtx (QImode);
+ emit_insn (gen_andqi3 (op1, operands[1], op2));
+ }
+ ix86_expand_setcc (operands[0], NE,
+ op1, CONST0_RTX (<MODE>mode));
+ DONE;
+})
+
+;; XOR-reduce a scalar mask: operand 0 (QImode) receives bit 0 of the
+;; population count of mask operand 1. Operand 2 is read as the
+;; element count; when it is below 8 only that many low bits of the
+;; mask take part in the reduction.
+(define_expand "reduc_sbool_xor_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+ (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand")]
+ "TARGET_AVX512F && TARGET_POPCNT
+ && (TARGET_64BIT || <MODE>mode != DImode)"
+{
+ rtx popcnt1, ext, op1 = operands[1];
+ int n_elt = INTVAL (operands[2]);
+ if (n_elt < 8)
+ {
+ /* Clear the mask bits at and above N_ELT so they cannot
+ contribute to the parity. */
+ rtx op2 = gen_int_mode ((1u << n_elt) - 1, QImode);
+ op1 = gen_reg_rtx (QImode);
+ emit_insn (gen_andqi3 (op1, operands[1], op2));
+ }
+
+ switch (<MODE_SIZE>)
+ {
+ case 1:
+ case 2:
+ /* Widen OP1, which already holds the masked value when N_ELT < 8.
+ Extending operands[1] here would throw that masking away. */
+ ext = gen_reg_rtx (SImode);
+ emit_move_insn (ext, gen_rtx_ZERO_EXTEND (SImode, op1));
+ op1 = ext;
+ /* FALLTHRU. */
+ case 4:
+ popcnt1 = gen_reg_rtx (SImode);
+ emit_insn (gen_popcountsi2 (popcnt1, op1));
+ emit_insn (gen_andsi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+ break;
+
+ case 8:
+ popcnt1 = gen_reg_rtx (DImode);
+ emit_insn (gen_popcountdi2 (popcnt1, op1));
+ emit_insn (gen_anddi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+ break;
+
+ default:
+ gcc_unreachable ();
+
+ }
+
+ /* Only bit 0 of the population count survives the AND above, so the
+ low byte is the reduction result. */
+ emit_move_insn (operands[0], gen_lowpart (QImode, popcnt1));
+ DONE;
+})
+
+(define_expand "reduc_sbool_and_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+ (match_operand:VI_AVX 1 "register_operand")]
+ "TARGET_SSE4_1"
+{
+ rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+ rtx op2, tmp;
+ if (TARGET_AVX2 || <MODE_SIZE> != 32)
+ {
+ op2 = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+ tmp = gen_reg_rtx (<MODE>mode);
+ rtx op1 = gen_rtx_EQ (<MODE>mode, operands[1], op2);
+ emit_insn (gen_vec_cmp<mode><mode> (tmp, op1, operands[1], op2));
+ }
+ else
+ {
+ op2 = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+ tmp = gen_reg_rtx (<MODE>mode);
+ rtx ops[3] = { tmp, operands[1], op2 };
+ ix86_expand_vector_logical_operator (XOR, <MODE>mode, ops);
+ }
+
+ tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec(2, tmp, tmp), UNSPEC_PTEST);
+ emit_insn (gen_rtx_SET (flags, tmp));
+ rtx ret = gen_rtx_fmt_ee (EQ, VOIDmode, flags, const0_rtx);
+ PUT_MODE (ret, QImode);
+ emit_insn (gen_rtx_SET (operands[0], ret));
+ DONE;
+
+})
+
+(define_expand "reduc_sbool_ior_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+ (match_operand:VI_AVX 1 "register_operand")]
+ "TARGET_SSE4_1"
+{
+ rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+ rtx tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec(2, operands[1], operands[1]), UNSPEC_PTEST);
+ emit_insn (gen_rtx_SET (flags, tmp));
+ rtx ret = gen_rtx_fmt_ee (NE, VOIDmode, flags, const0_rtx);
+ PUT_MODE (ret, QImode);
+ emit_insn (gen_rtx_SET (operands[0], ret));
+ DONE;
+})
+
+(define_expand "reduc_sbool_xor_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+ (match_operand:VI1_AVX2 1 "register_operand")]
+ "TARGET_SSE2 && TARGET_POPCNT"
+{
+ rtx popcnt1 = gen_reg_rtx (SImode);
+ emit_insn (gen_<sse2_avx2>_pmovmskb (popcnt1,operands[1]));
+
+ emit_insn (gen_popcountsi2 (popcnt1, popcnt1));
+ emit_insn (gen_andsi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+
+ emit_move_insn (operands[0], gen_lowpart (QImode, popcnt1));
+ DONE;
+})
+
+(define_mode_attr ssefltvecmode
+ [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
+
+(define_expand "reduc_sbool_xor_scal_<mode>"
+ [(match_operand:QI 0 "register_operand")
+ (match_operand:VI48_AVX 1 "register_operand")]
+ "TARGET_SSE2 && TARGET_POPCNT"
+{
+ rtx popcnt1 = gen_reg_rtx (SImode);
+ rtx tmp = gen_rtx_UNSPEC (SImode, gen_rtvec(1,
+ gen_lowpart (<ssefltvecmode>mode,
+ operands[1])),
+ UNSPEC_MOVMSK);
+ emit_insn (gen_rtx_SET (popcnt1, tmp));
+
+ emit_insn (gen_popcountsi2 (popcnt1, popcnt1));
+ emit_insn (gen_andsi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+
+ emit_move_insn (operands[0], gen_lowpart (QImode, popcnt1));
+ DONE;
+})
+
(define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>"
[(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
(unspec:VFH_AVX512VL
@@ -4632,6 +4809,33 @@
UNSPEC_PCMP_ITER))]
"operands[4] = GEN_INT (INTVAL (operands[3]) ^ 4);")
+;; Fold an integer mask compare whose two inputs are the same rtx
+;; (rtx_equal_p) into a constant mask before reload. The immediate
+;; uses the VPCMP predicate encoding: 0 EQ, 1 LT, 2 LE, 3 FALSE,
+;; 4 NE, 5 NLT, 6 NLE, 7 TRUE.
+(define_insn_and_split "*<avx512>_cmp<mode>3_dup_op"
+ [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+ (unspec:<avx512fmaskmode>
+ [(match_operand:VI1248_AVX512VLBW 1 "general_operand")
+ (match_operand:VI1248_AVX512VLBW 2 "general_operand")
+ (match_operand:SI 3 "<cmp_imm_predicate>")]
+ UNSPEC_PCMP_ITER))]
+ "TARGET_AVX512F && ix86_pre_reload_split ()
+ && rtx_equal_p (operands[1], operands[2])"
+ "#"
+ "&& 1"
+ [(set (match_dup 0) (match_dup 4))]
+{
+ int cmp_imm = INTVAL (operands[3]);
+ /* LT/FALSE/NE/NLE never hold for identical operands. */
+ rtx res = CONST0_RTX (<avx512fmaskmode>mode);
+ /* EQ/LE/NLT/TRUE always hold for identical operands. */
+ if (cmp_imm == 0 || cmp_imm == 2 || cmp_imm == 5 || cmp_imm == 7)
+ {
+ int nelts = GET_MODE_NUNITS (<MODE>mode);
+ if (nelts >= 8)
+ res = CONSTM1_RTX (<avx512fmaskmode>mode);
+ else
+ /* Fewer than 8 elements: set only the low NELTS mask bits. */
+ res = gen_int_mode ((1u << nelts) - 1, QImode);
+ }
+ operands[4] = res;
+})
+
(define_insn "*<avx512>_eq<mode>3<mask_scalar_merge_name>_1"
[(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k,k")
(unspec:<avx512fmaskmode>
@@ -17975,6 +18179,24 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
+(define_insn_and_split "*eq<mode>3_2_negate"
+ [(set (match_operand:VI_AVX2_CMP 0 "register_operand")
+ (eq:VI_AVX2_CMP
+ (eq:VI_AVX2_CMP
+ (eq: VI_AVX2_CMP
+ (match_operand:VI_AVX2_CMP 1 "nonimmediate_operand")
+ (match_operand:VI_AVX2_CMP 2 "general_operand"))
+ (match_operand:VI_AVX2_CMP 3 "const0_operand"))
+ (match_operand:VI_AVX2_CMP 4 "const0_operand")))]
+ "TARGET_SSE4_1 && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (eq:VI_AVX2_CMP (match_dup 1)
+ (match_dup 5)))]
+ "operands[5] = force_reg (<MODE>mode, operands[2]);")
+
+
(define_insn_and_split "*avx2_pcmp<mode>3_1"
[(set (match_operand:VI_128_256 0 "register_operand")
(vec_merge:VI_128_256
@@ -23665,9 +23887,6 @@
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "<MODE>")])
-(define_mode_attr ssefltvecmode
- [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
-
(define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint"
[(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x")
(unspec:<ssebytemode>
@@ -25482,6 +25701,36 @@
(match_dup 0)
(pc)))])
+
+;; (unspec:ccz [(eq (eq op0 const0) const0)] unspec_ptest)
+;; is equal to (unspec:ccz [op0 op0] unspec_ptest).
+(define_insn_and_split "*ptest<mode>_ccz"
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ
+ [(eq:VI_AVX
+ (eq:VI_AVX
+ (match_operand:VI_AVX 0 "vector_operand")
+ (match_operand:VI_AVX 1 "const0_operand"))
+ (match_operand:VI_AVX 2 "const0_operand"))
+ (eq:VI_AVX
+ (eq:VI_AVX (match_dup 0) (match_dup 1))
+ (match_dup 2))]
+ UNSPEC_PTEST))]
+ "TARGET_SSE4_1
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (reg:CCZ FLAGS_REG)
+ (unspec:CCZ
+ [(match_dup 3) (match_dup 3)]
+ UNSPEC_PTEST))]
+{
+ if (MEM_P (operands[0]))
+ operands[3] = force_reg (<MODE>mode, operands[0]);
+ else
+ operands[3] = operands[0];
+})
+
(define_expand "nearbyint<mode>2"
[(set (match_operand:VFH 0 "register_operand")
(unspec:VFH