diff options
Diffstat (limited to 'gcc/config/i386')
-rw-r--r-- gcc/config/i386/driver-i386.cc | 24
-rw-r--r-- gcc/config/i386/i386-c.cc | 7
-rw-r--r-- gcc/config/i386/i386-expand.cc | 6
-rw-r--r-- gcc/config/i386/i386-features.cc | 141
-rw-r--r-- gcc/config/i386/i386-jit.cc | 12
-rw-r--r-- gcc/config/i386/i386-options.cc | 4
-rw-r--r-- gcc/config/i386/i386.h | 2
-rw-r--r-- gcc/config/i386/sse.md | 257
8 files changed, 434 insertions, 19 deletions
diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
index fe71f55..0557df9 100644
--- a/gcc/config/i386/driver-i386.cc
+++ b/gcc/config/i386/driver-i386.cc
@@ -553,6 +553,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
           processor = PROCESSOR_PENTIUM;
           break;
         case 6:
+        case 18:
         case 19:
           processor = PROCESSOR_PENTIUMPRO;
           break;
@@ -639,18 +640,27 @@ const char *host_detect_local_cpu (int argc, const char **argv)
                 }
               else if (has_feature (FEATURE_AVX))
                 {
-                  /* Assume Panther Lake.  */
-                  if (has_feature (FEATURE_PREFETCHI))
-                    cpu = "pantherlake";
                   /* Assume Clearwater Forest.  */
-                  else if (has_feature (FEATURE_USER_MSR))
+                  if (has_feature (FEATURE_USER_MSR))
                     cpu = "clearwaterforest";
-                  /* Assume Arrow Lake S.  */
+                  /* Assume Nova Lake.  */
+                  else if (has_feature (FEATURE_PREFETCHI))
+                    cpu = "novalake";
                   else if (has_feature (FEATURE_SM3))
-                    cpu = "arrowlake-s";
+                    {
+                      if (has_feature (FEATURE_KL))
+                        /* Assume Arrow Lake S.  */
+                        cpu = "arrowlake-s";
+                      else
+                        /* Assume Panther Lake.  */
+                        cpu = "pantherlake";
+                    }
                   /* Assume Sierra Forest.  */
-                  else if (has_feature (FEATURE_AVXVNNIINT8))
+                  else if (has_feature (FEATURE_CLDEMOTE))
                     cpu = "sierraforest";
+                  /* Assume Arrow Lake.  */
+                  else if (has_feature (FEATURE_AVXVNNIINT8))
+                    cpu = "arrowlake";
                   /* Assume Alder Lake.  */
                   else if (has_feature (FEATURE_SERIALIZE))
                     cpu = "alderlake";
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 0037465..2d92cee 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -295,6 +295,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
       def_or_undef (parse_in, "__diamondrapids");
       def_or_undef (parse_in, "__diamondrapids__");
       break;
+    case PROCESSOR_NOVALAKE:
+      def_or_undef (parse_in, "__novalake");
+      def_or_undef (parse_in, "__novalake__");
+      break;
 
     /* use PROCESSOR_max to not set/unset the arch macro.  */
     case PROCESSOR_max:
@@ -498,6 +502,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     case PROCESSOR_DIAMONDRAPIDS:
       def_or_undef (parse_in, "__tune_diamondrapids__");
       break;
+    case PROCESSOR_NOVALAKE:
+      def_or_undef (parse_in, "__tune_novalake__");
+      break;
     case PROCESSOR_INTEL:
     case PROCESSOR_GENERIC:
       break;
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5bcc35c..a1f1b26 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -9515,9 +9515,9 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
   machine_mode move_mode = VOIDmode;
   int unroll_factor = 1;
   /* TODO: Once value ranges are available, fill in proper data.  */
-  unsigned HOST_WIDE_INT min_size = 0;
-  unsigned HOST_WIDE_INT max_size = -1;
-  unsigned HOST_WIDE_INT probable_max_size = -1;
+  unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
+  unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U;
+  unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
   bool misaligned_prologue_used = false;
   addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;
 
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 9348f55..8e27784 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -449,6 +449,30 @@ scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
   return true;
 }
 
+/* Check whether X is a convertible *concatditi_? variant.  X is known
+   to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI.  */
+
+static bool
+timode_concatdi_p (rtx x)
+{
+  rtx op0 = XEXP (x, 0);
+  rtx op1 = XEXP (x, 1);
+
+  if (GET_CODE (op1) == ASHIFT)
+    std::swap (op0, op1);
+
+  return GET_CODE (op0) == ASHIFT
+         && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
+         && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
+         && REG_P (XEXP (XEXP (op0, 0), 0))
+         && CONST_INT_P (XEXP (op0, 1))
+         && INTVAL (XEXP (op0, 1)) == 64
+         && GET_CODE (op1) == ZERO_EXTEND
+         && GET_MODE (XEXP (op1, 0)) == DImode
+         && REG_P (XEXP (op1, 0));
+}
+
+
 /* Add instruction into a chain.  Return true if OK, false
    if the search was aborted.  */
 
@@ -477,9 +501,26 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
       if (!analyze_register_chain (candidates, ref, disallowed))
         return false;
 
-  /* The operand(s) of VEC_SELECT don't need to be converted/convertible.  */
-  if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
-    return true;
+  /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
+     to be converted/convertible.  */
+  if (def_set)
+    switch (GET_CODE (SET_SRC (def_set)))
+      {
+      case VEC_SELECT:
+        return true;
+      case ZERO_EXTEND:
+        if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
+          return true;
+        break;
+      case PLUS:
+      case IOR:
+      case XOR:
+        if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
+          return true;
+        break;
+      default:
+        break;
+      }
 
   for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
     if (!DF_REF_REG_MEM_P (ref))
@@ -1628,14 +1669,34 @@ timode_scalar_chain::compute_convert_gain ()
           break;
 
         case AND:
+          if (!MEM_P (dst))
+            igain = COSTS_N_INSNS (1);
+          if (CONST_SCALAR_INT_P (XEXP (src, 1)))
+            igain += timode_immed_const_gain (XEXP (src, 1), bb);
+          break;
+
         case XOR:
         case IOR:
+          if (timode_concatdi_p (src))
+            {
+              /* vmovq;vpinsrq (11 bytes).  */
+              igain = speed_p ? -2 * ix86_cost->sse_to_integer
+                              : -COSTS_N_BYTES (11);
+              break;
+            }
           if (!MEM_P (dst))
             igain = COSTS_N_INSNS (1);
           if (CONST_SCALAR_INT_P (XEXP (src, 1)))
             igain += timode_immed_const_gain (XEXP (src, 1), bb);
           break;
 
+        case PLUS:
+          if (timode_concatdi_p (src))
+            /* vmovq;vpinsrq (11 bytes).  */
+            igain = speed_p ? -2 * ix86_cost->sse_to_integer
+                            : -COSTS_N_BYTES (11);
+          break;
+
         case ASHIFT:
         case LSHIFTRT:
           /* See ix86_expand_v1ti_shift.  */
@@ -1794,6 +1855,13 @@ timode_scalar_chain::compute_convert_gain ()
           igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
           break;
 
+        case ZERO_EXTEND:
+          if (GET_MODE (XEXP (src, 0)) == DImode)
+            /* xor (2 bytes) vs. vmovq (5 bytes).  */
+            igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
+                            : -COSTS_N_BYTES (3);
+          break;
+
         default:
           break;
         }
@@ -1858,6 +1926,28 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg)
     }
 }
 
+/* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
+   Insert this before INSN, and return the result as a V1TImode subreg.  */
+
+static rtx
+timode_convert_concatdi (rtx src, rtx_insn *insn)
+{
+  rtx hi, lo;
+  rtx tmp = gen_reg_rtx (V2DImode);
+  if (GET_CODE (XEXP (src, 0)) == ASHIFT)
+    {
+      hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
+      lo = XEXP (XEXP (src, 1), 0);
+    }
+  else
+    {
+      hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
+      lo = XEXP (XEXP (src, 0), 0);
+    }
+  emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn);
+  return gen_rtx_SUBREG (V1TImode, tmp, 0);
+}
+
 /* Convert INSN from TImode to V1T1mode.  */
 
 void
@@ -1967,10 +2057,24 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
           PUT_MODE (src, V1TImode);
           break;
         }
-      /* FALLTHRU */
+      convert_op (&XEXP (src, 0), insn);
+      convert_op (&XEXP (src, 1), insn);
+      PUT_MODE (src, V1TImode);
+      if (MEM_P (dst))
+        {
+          tmp = gen_reg_rtx (V1TImode);
+          emit_insn_before (gen_rtx_SET (tmp, src), insn);
+          src = tmp;
+        }
+      break;
 
     case XOR:
     case IOR:
+      if (timode_concatdi_p (src))
+        {
+          src = timode_convert_concatdi (src, insn);
+          break;
+        }
       convert_op (&XEXP (src, 0), insn);
       convert_op (&XEXP (src, 1), insn);
       PUT_MODE (src, V1TImode);
@@ -2010,6 +2114,26 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
       PUT_MODE (src, V1TImode);
       break;
 
+    case ZERO_EXTEND:
+      if (GET_MODE (XEXP (src, 0)) == DImode)
+        {
+          /* Convert to *vec_concatv2di_0.  */
+          rtx tmp = gen_reg_rtx (V2DImode);
+          rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
+          emit_insn_before (gen_move_insn (tmp, pat), insn);
+          src = gen_rtx_SUBREG (vmode, tmp, 0);
+        }
+      else
+        gcc_unreachable ();
+      break;
+
+    case PLUS:
+      if (timode_concatdi_p (src))
+        src = timode_convert_concatdi (src, insn);
+      else
+        gcc_unreachable ();
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -2389,6 +2513,8 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
 
     case IOR:
     case XOR:
+      if (timode_concatdi_p (src))
+        return true;
       return (REG_P (XEXP (src, 0))
               || timode_mem_p (XEXP (src, 0)))
              && (REG_P (XEXP (src, 1))
@@ -2408,6 +2534,13 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
              && CONST_INT_P (XEXP (src, 1))
              && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
 
+    case PLUS:
+      return timode_concatdi_p (src);
+
+    case ZERO_EXTEND:
+      return REG_P (XEXP (src, 0))
+             && GET_MODE (XEXP (src, 0)) == DImode;
+
     default:
       return false;
     }
diff --git a/gcc/config/i386/i386-jit.cc b/gcc/config/i386/i386-jit.cc
index c1e2929..73ca590 100644
--- a/gcc/config/i386/i386-jit.cc
+++ b/gcc/config/i386/i386-jit.cc
@@ -65,6 +65,18 @@ ix86_jit_register_target_info (void)
       jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_INT128_T);
     }
 
+  if (float16_type_node != NULL && TYPE_PRECISION (float16_type_node) == 16)
+    jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_FLOAT16);
+
+  if (float32_type_node != NULL && TYPE_PRECISION (float32_type_node) == 32)
+    jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_FLOAT32);
+
+  if (float64_type_node != NULL && TYPE_PRECISION (float64_type_node) == 64)
+    jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_FLOAT64);
+
+  if (float128_type_node != NULL && TYPE_PRECISION (float128_type_node) == 128)
+    jit_target_add_supported_target_dependent_type (GCC_JIT_TYPE_FLOAT128);
+
 #define ADD_TARGET_INFO jit_add_target_info
 #include "i386-rust-and-jit.inc"
 #undef ADD_TARGET_INFO
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 35cba3f..dadcf76 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -132,6 +132,7 @@ along with GCC; see the file COPYING3.  If not see
 #define m_ARROWLAKE_S (HOST_WIDE_INT_1U<<PROCESSOR_ARROWLAKE_S)
 #define m_PANTHERLAKE (HOST_WIDE_INT_1U<<PROCESSOR_PANTHERLAKE)
 #define m_DIAMONDRAPIDS (HOST_WIDE_INT_1U<<PROCESSOR_DIAMONDRAPIDS)
+#define m_NOVALAKE (HOST_WIDE_INT_1U<<PROCESSOR_NOVALAKE)
 #define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
                        | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
                        | m_TIGERLAKE | m_COOPERLAKE | m_SAPPHIRERAPIDS \
@@ -140,7 +141,7 @@ along with GCC; see the file COPYING3.  If not see
 #define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2)
 #define m_CORE_HYBRID (m_ALDERLAKE | m_ARROWLAKE | m_ARROWLAKE_S \
-                       | m_PANTHERLAKE)
+                       | m_PANTHERLAKE | m_NOVALAKE)
 #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT)
 #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
 #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
@@ -790,6 +791,7 @@ static const struct processor_costs *processor_cost_table[] =
   &alderlake_cost,              /* PROCESSOR_ARROWLAKE_S.  */
   &alderlake_cost,              /* PROCESSOR_PANTHERLAKE.  */
   &icelake_cost,                /* PROCESSOR_DIAMONDRAPIDS.  */
+  &alderlake_cost,              /* PROCESSOR_NOVALAKE.  */
   &alderlake_cost,              /* PROCESSOR_INTEL.  */
   &lujiazui_cost,               /* PROCESSOR_LUJIAZUI.  */
   &yongfeng_cost,               /* PROCESSOR_YONGFENG.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 3a66d78..94f335f 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2356,6 +2356,7 @@ enum processor_type
   PROCESSOR_ARROWLAKE_S,
   PROCESSOR_PANTHERLAKE,
   PROCESSOR_DIAMONDRAPIDS,
+  PROCESSOR_NOVALAKE,
   PROCESSOR_INTEL,
   PROCESSOR_LUJIAZUI,
   PROCESSOR_YONGFENG,
@@ -2487,6 +2488,7 @@ constexpr wide_int_bitmask PTA_DIAMONDRAPIDS = PTA_GRANITERAPIDS_D
   | PTA_CMPCCXADD | PTA_SHA512 | PTA_SM3 | PTA_SM4 | PTA_AVX10_2
   | PTA_APX_F | PTA_AMX_AVX512 | PTA_AMX_FP8 | PTA_AMX_TF32 | PTA_MOVRS
   | PTA_AMX_MOVRS;
+constexpr wide_int_bitmask PTA_NOVALAKE = PTA_PANTHERLAKE | PTA_PREFETCHI;
 
 constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
   | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8b28c8e..5eba992 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -569,6 +569,18 @@
    (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
    (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
 
+(define_mode_iterator VI_AVX
+  [(V32QI "TARGET_AVX") V16QI
+   (V16HI "TARGET_AVX") V8HI
+   (V8SI "TARGET_AVX") V4SI
+   (V4DI "TARGET_AVX") V2DI])
+
+(define_mode_iterator VI_AVX2_CMP
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI
+   (V4DI "TARGET_AVX2") V2DI])
+
 (define_mode_iterator VI_AVX_AVX512F
   [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
    (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
@@ -896,7 +908,8 @@
 (define_mode_attr ssebytemode
   [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")
    (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")
-   (V8HI "V16QI")])
+   (V16HI "V32QI") (V8HI "V16QI")
+   (V32QI "V32QI") (V16QI "V16QI")])
 
 (define_mode_attr sseintconvert
   [(V32HI "w") (V16HI "w") (V8HI "w")
@@ -4013,6 +4026,170 @@
   DONE;
 })
 
+(define_expand "reduc_sbool_and_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")
+   (match_operand:SI 2 "const_int_operand")]
+  "TARGET_AVX512F"
+{
+  int n_elt = INTVAL (operands[2]);
+  rtx op2 = CONSTM1_RTX (<MODE>mode);
+  rtx op1 = operands[1];
+  if (n_elt < 8)
+    {
+      op2 = gen_int_mode ((1u << n_elt) - 1, QImode);
+      op1 = gen_reg_rtx (QImode);
+      emit_insn (gen_andqi3 (op1, operands[1], op2));
+    }
+  ix86_expand_setcc (operands[0], EQ, op1, op2);
+  DONE;
+})
+
+(define_expand "reduc_sbool_ior_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")
+   (match_operand:SI 2 "const_int_operand")]
+  "TARGET_AVX512F"
+{
+  int n_elt = INTVAL (operands[2]);
+  rtx op1 = operands[1];
+  if (n_elt < 8)
+    {
+      rtx op2 = gen_int_mode ((1u << n_elt) - 1, QImode);
+      op1 = gen_reg_rtx (QImode);
+      emit_insn (gen_andqi3 (op1, operands[1], op2));
+    }
+  ix86_expand_setcc (operands[0], NE,
+                     op1, CONST0_RTX (<MODE>mode));
+  DONE;
+})
+
+(define_expand "reduc_sbool_xor_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")
+   (match_operand:SI 2 "const_int_operand")]
+  "TARGET_AVX512F && TARGET_POPCNT
+   && (TARGET_64BIT || <MODE>mode != DImode)"
+{
+  rtx popcnt1, op1 = operands[1];
+  int n_elt = INTVAL (operands[2]);
+  if (n_elt < 8)
+    {
+      rtx op2 = gen_int_mode ((1u << n_elt) - 1, QImode);
+      op1 = gen_reg_rtx (QImode);
+      emit_insn (gen_andqi3 (op1, operands[1], op2));
+    }
+
+  switch (<MODE_SIZE>)
+    {
+    case 1:
+    case 2:
+      /* Extend OP1 (masked above when n_elt < 8), not operands[1].  */
+      op1 = force_reg (SImode, gen_rtx_ZERO_EXTEND (SImode, op1));
+      /* FALLTHRU.  */
+    case 4:
+      popcnt1 = gen_reg_rtx (SImode);
+      emit_insn (gen_popcountsi2 (popcnt1, op1));
+      emit_insn (gen_andsi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+      break;
+
+    case 8:
+      popcnt1 = gen_reg_rtx (DImode);
+      emit_insn (gen_popcountdi2 (popcnt1, op1));
+      emit_insn (gen_anddi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+      break;
+
+    default:
+      gcc_unreachable ();
+
+    }
+
+  emit_move_insn (operands[0], gen_lowpart (QImode, popcnt1));
+  DONE;
+})
+
+(define_expand "reduc_sbool_and_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:VI_AVX 1 "register_operand")]
+  "TARGET_SSE4_1"
+{
+  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+  rtx op2, tmp;
+  if (TARGET_AVX2 || <MODE_SIZE> != 32)
+    {
+      op2 = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+      tmp = gen_reg_rtx (<MODE>mode);
+      rtx op1 = gen_rtx_EQ (<MODE>mode, operands[1], op2);
+      emit_insn (gen_vec_cmp<mode><mode> (tmp, op1, operands[1], op2));
+    }
+  else
+    {
+      op2 = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+      tmp = gen_reg_rtx (<MODE>mode);
+      rtx ops[3] = { tmp, operands[1], op2 };
+      ix86_expand_vector_logical_operator (XOR, <MODE>mode, ops);
+    }
+
+  tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec(2, tmp, tmp), UNSPEC_PTEST);
+  emit_insn (gen_rtx_SET (flags, tmp));
+  rtx ret = gen_rtx_fmt_ee (EQ, VOIDmode, flags, const0_rtx);
+  PUT_MODE (ret, QImode);
+  emit_insn (gen_rtx_SET (operands[0], ret));
+  DONE;
+
+})
+
+(define_expand "reduc_sbool_ior_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:VI_AVX 1 "register_operand")]
+  "TARGET_SSE4_1"
+{
+  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+  rtx tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec(2, operands[1], operands[1]), UNSPEC_PTEST);
+  emit_insn (gen_rtx_SET (flags, tmp));
+  rtx ret = gen_rtx_fmt_ee (NE, VOIDmode, flags, const0_rtx);
+  PUT_MODE (ret, QImode);
+  emit_insn (gen_rtx_SET (operands[0], ret));
+  DONE;
+})
+
+(define_expand "reduc_sbool_xor_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:VI1_AVX2 1 "register_operand")]
+  "TARGET_SSE2 && TARGET_POPCNT"
+{
+  rtx popcnt1 = gen_reg_rtx (SImode);
+  emit_insn (gen_<sse2_avx2>_pmovmskb (popcnt1,operands[1]));
+
+  emit_insn (gen_popcountsi2 (popcnt1, popcnt1));
+  emit_insn (gen_andsi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+
+  emit_move_insn (operands[0], gen_lowpart (QImode, popcnt1));
+  DONE;
+})
+
+(define_mode_attr ssefltvecmode
+  [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
+
+(define_expand "reduc_sbool_xor_scal_<mode>"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:VI48_AVX 1 "register_operand")]
+  "TARGET_SSE2 && TARGET_POPCNT"
+{
+  rtx popcnt1 = gen_reg_rtx (SImode);
+  rtx tmp = gen_rtx_UNSPEC (SImode, gen_rtvec(1,
+                                      gen_lowpart (<ssefltvecmode>mode,
+                                                   operands[1])),
+                            UNSPEC_MOVMSK);
+  emit_insn (gen_rtx_SET (popcnt1, tmp));
+
+  emit_insn (gen_popcountsi2 (popcnt1, popcnt1));
+  emit_insn (gen_andsi3 (popcnt1, popcnt1, GEN_INT (0x1)));
+
+  emit_move_insn (operands[0], gen_lowpart (QImode, popcnt1));
+  DONE;
+})
+
 (define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>"
   [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
         (unspec:VFH_AVX512VL
@@ -4632,6 +4809,33 @@
          UNSPEC_PCMP_ITER))]
   "operands[4] = GEN_INT (INTVAL (operands[3]) ^ 4);")
 
+(define_insn_and_split "*<avx512>_cmp<mode>3_dup_op"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+        (unspec:<avx512fmaskmode>
+          [(match_operand:VI1248_AVX512VLBW 1 "general_operand")
+           (match_operand:VI1248_AVX512VLBW 2 "general_operand")
+           (match_operand:SI 3 "<cmp_imm_predicate>")]
+          UNSPEC_PCMP_ITER))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()
+   && rtx_equal_p (operands[1], operands[2])"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (match_dup 4))]
+{
+  int cmp_imm = INTVAL (operands[3]);
+  rtx res = CONST0_RTX (<avx512fmaskmode>mode);
+  /* EQ/LE/NLT/TRUE hold for identical operands; LT/FALSE/NE/NLE do not.  */
+  if (cmp_imm == 0 || cmp_imm == 2 || cmp_imm == 5 || cmp_imm == 7)
+    {
+      int nelts = GET_MODE_NUNITS (<MODE>mode);
+      if (nelts >= 8)
+        res = CONSTM1_RTX (<avx512fmaskmode>mode);
+      else
+        res = gen_int_mode ((1u << nelts) - 1, QImode);
+    }
+  operands[4] = res;
+})
+
 (define_insn "*<avx512>_eq<mode>3<mask_scalar_merge_name>_1"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k,k")
         (unspec:<avx512fmaskmode>
@@ -17975,6 +18179,24 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*eq<mode>3_2_negate"
+  [(set (match_operand:VI_AVX2_CMP 0 "register_operand")
+        (eq:VI_AVX2_CMP
+          (eq:VI_AVX2_CMP
+            (eq: VI_AVX2_CMP
+              (match_operand:VI_AVX2_CMP 1 "nonimmediate_operand")
+              (match_operand:VI_AVX2_CMP 2 "general_operand"))
+            (match_operand:VI_AVX2_CMP 3 "const0_operand"))
+          (match_operand:VI_AVX2_CMP 4 "const0_operand")))]
+  "TARGET_SSE4_1 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+        (eq:VI_AVX2_CMP (match_dup 1)
+                        (match_dup 5)))]
+  "operands[5] = force_reg (<MODE>mode, operands[2]);")
+
+
 (define_insn_and_split "*avx2_pcmp<mode>3_1"
   [(set (match_operand:VI_128_256 0 "register_operand")
         (vec_merge:VI_128_256
@@ -23665,9 +23887,6 @@
    (set_attr "btver2_decode" "vector,vector,vector")
    (set_attr "mode" "<MODE>")])
 
-(define_mode_attr ssefltvecmode
-  [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
-
 (define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint"
   [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x")
         (unspec:<ssebytemode>
@@ -25482,6 +25701,36 @@
                       (match_dup 0)
                       (pc)))])
 
+
+;; (unspec:ccz [(eq (eq op0 const0) const0)] unspec_ptest)
+;; is equal to (unspec:ccz [op0 op0] unspec_ptest).
+(define_insn_and_split "*ptest<mode>_ccz"
+  [(set (reg:CCZ FLAGS_REG)
+        (unspec:CCZ
+          [(eq:VI_AVX
+             (eq:VI_AVX
+               (match_operand:VI_AVX 0 "vector_operand")
+               (match_operand:VI_AVX 1 "const0_operand"))
+             (match_operand:VI_AVX 2 "const0_operand"))
+           (eq:VI_AVX
+             (eq:VI_AVX (match_dup 0) (match_dup 1))
+             (match_dup 2))]
+          UNSPEC_PTEST))]
+  "TARGET_SSE4_1
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (reg:CCZ FLAGS_REG)
+        (unspec:CCZ
+          [(match_dup 3) (match_dup 3)]
+          UNSPEC_PTEST))]
+{
+  if (MEM_P (operands[0]))
+    operands[3] = force_reg (<MODE>mode, operands[0]);
+  else
+    operands[3] = operands[0];
+})
+
 (define_expand "nearbyint<mode>2"
   [(set (match_operand:VFH 0 "register_operand")
         (unspec:VFH